From 63cfc8725215baea94fc2400f4d98ff96caf5ec2 Mon Sep 17 00:00:00 2001
From: Menci <mencici@msn.com>
Date: Sat, 20 Jun 2026 03:57:00 +0800
Subject: [PATCH 1/6] feat(protocols,gateway): tier-aware pricing overlay
 (ModelPricing.tiers + TokenUsage.tier + resolveEffectivePricing)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Captures the per-service-tier dimension that bare ModelPricing misses:
distinct service tiers for the same model (Anthropic standard / fast, OpenAI
default / priority / flex / scale) are priced at different rates, and the
gateway needs to surface that distinction in the cost aggregate.

- `ModelPricing.tiers` carries per-service-tier overrides, keyed by the
  wire-value the upstream stamps on the usage object. `resolveEffectivePricing`
  folds a tier override into a flat snapshot before any unit-price lookup,
  so every downstream `unitPriceForDimension` call sees one self-contained map.
- `UsageRecord` and `TokenUsage` grow a `tier` slot; the usage tables key
  buckets on (keyId, model, upstream, modelKey, hour, tier) so distinct tiers
  aggregate as separate buckets with distinct unit-price snapshots. Existing
  rows with `tier = NULL` keep computing identically to before (the resolver
  treats null as base pricing and returns the snapshot sans the `tiers` key).
- `recordTokenUsage` threads the tier from the parsed `TokenUsage` onto
  the bucket so cost compute applies the right override; `tokenUsage`'s
  zero-dimension filter passes `tier` through verbatim.
- Control-plane export / import surfaces the tier alongside the other
  bucket-identity fields; missing tier defaults to null on import.
- Provider config parsers iterate `BILLING_DIMENSIONS` directly instead of
  a hand-rolled `keyof ModelPricing` list — the latter would now include
  `tiers` and admit a non-numeric value into `pricing[dimension]`.

Schema: the SQL repo writes the tier column directly; depends on the sibling
migration adding `tier` to `usage` + `usage_requests`.
---
 packages/gateway/src/app-control_test.ts      |  5 ++
 .../src/control-plane/data-transfer/routes.ts |  7 +++
 .../data-transfer/routes_test.ts              |  2 +
 .../control-plane/token-usage/aggregate.ts    |  5 +-
 .../token-usage/aggregate_test.ts             | 28 ++++++++++
 .../control-plane/token-usage/routes_test.ts  |  1 +
 .../src/data-plane/shared/telemetry/usage.ts  | 16 +++++-
 packages/gateway/src/repo/memory.ts           | 13 +++--
 packages/gateway/src/repo/sql.ts              | 48 ++++++++--------
 packages/gateway/src/repo/types.ts            | 33 ++++++++---
 packages/protocols/src/common/models.ts       | 24 +++++++-
 packages/protocols/src/common/models_test.ts  | 56 ++++++++++++++++++-
 12 files changed, 198 insertions(+), 40 deletions(-)

diff --git a/packages/gateway/src/app-control_test.ts b/packages/gateway/src/app-control_test.ts
index e97456c4c..dec01ede8 100644
--- a/packages/gateway/src/app-control_test.ts
+++ b/packages/gateway/src/app-control_test.ts
@@ -108,6 +108,7 @@ test('/api/token-usage scopes to the actor\'s keys when called with an API key',
     upstream: null,
     modelKey: 'claude-sonnet-4',
     hour: '2026-03-15T10',
+    tier: null,
     requests: 2,
     tokens: { input: 10, output: 5, input_cache_read: 4, input_cache_write: 1 },
     cost: null,
@@ -118,6 +119,7 @@ test('/api/token-usage scopes to the actor\'s keys when called with an API key',
     upstream: null,
     modelKey: 'gpt-5',
     hour: '2026-03-15T11',
+    tier: null,
     requests: 1,
     tokens: { input: 20, output: 8, input_cache_read: 6, input_cache_write: 2 },
     cost: null,
@@ -155,6 +157,7 @@ test('/api/token-usage in self-by-key mode includes per-key metadata for the act
     upstream: null,
     modelKey: 'gpt-5',
     hour: '2026-03-16T10',
+    tier: null,
     requests: 1,
     tokens: { input: 20, output: 8 },
     cost: null,
@@ -182,6 +185,7 @@ test('/api/token-usage all-by-user view aggregates across keys per user', async
     upstream: null,
     modelKey: 'gpt-5',
     hour: '2026-03-15T10',
+    tier: null,
     requests: 1,
     tokens: { input: 10, output: 5 },
     cost: null,
@@ -213,6 +217,7 @@ test('/api/token-usage merges Claude variants into backend base model records',
     keyId: apiKey.id,
     hour: '2026-03-17T10',
     upstream: 'copilot:1',
+    tier: null,
     requests: 1,
     tokens: { input: 10, output: 5, input_cache_read: 2, input_cache_write: 1 },
   };
diff --git a/packages/gateway/src/control-plane/data-transfer/routes.ts b/packages/gateway/src/control-plane/data-transfer/routes.ts
index 0a57f7781..780a0318d 100644
--- a/packages/gateway/src/control-plane/data-transfer/routes.ts
+++ b/packages/gateway/src/control-plane/data-transfer/routes.ts
@@ -402,6 +402,12 @@ const parseUsageRecords = (value: unknown): { type: 'ok'; records: UsageRecord[]
     if (typeof record.upstream === 'string' && isLegacyUpstreamIdentity(record.upstream)) {
       return { type: 'invalid', index: i, error: 'upstream must use a raw upstream id, not a legacy provider-prefixed identity' };
     }
+    if (record.tier !== undefined && record.tier !== null && typeof record.tier !== 'string') {
+      return { type: 'invalid', index: i, error: 'record has invalid tier (must be a string or null)' };
+    }
+    // `tier` is absent on exports taken before the column existed; collapse
+    // the absent and explicit-null cases into the same wire value.
+    const tier: string | null = typeof record.tier === 'string' ? record.tier : null;
     const tokensResult = parseImportedTokens(record.tokens);
     if (tokensResult.type === 'invalid') return { type: 'invalid', index: i, error: 'record has invalid token dimension counts' };
     const costResult = parseImportedCost(record.cost);
@@ -412,6 +418,7 @@ const parseUsageRecords = (value: unknown): { type: 'ok'; records: UsageRecord[]
       upstream: record.upstream as string | null,
       modelKey: record.modelKey,
       hour: record.hour,
+      tier,
       requests: record.requests,
       tokens: tokensResult.tokens,
       cost: costResult.cost,
diff --git a/packages/gateway/src/control-plane/data-transfer/routes_test.ts b/packages/gateway/src/control-plane/data-transfer/routes_test.ts
index 70f108c03..d43d7a975 100644
--- a/packages/gateway/src/control-plane/data-transfer/routes_test.ts
+++ b/packages/gateway/src/control-plane/data-transfer/routes_test.ts
@@ -177,6 +177,7 @@ const USAGE_1: UsageRecord = {
   upstream: 'up_copilot_a',
   modelKey: 'claude-opus-4.7',
   hour: '2026-01-01T10',
+  tier: null,
   requests: 5,
   tokens: { input: 1000, output: 500, input_cache_read: 120, input_cache_write: 80 },
   cost: null,
@@ -188,6 +189,7 @@ const USAGE_2: UsageRecord = {
   upstream: 'up_azure_a',
   modelKey: 'gpt-prod',
   hour: '2026-01-01T11',
+  tier: null,
   requests: 3,
   tokens: { input: 2000, output: 800, input_cache_read: 200, input_cache_write: 50 },
   cost: null,
diff --git a/packages/gateway/src/control-plane/token-usage/aggregate.ts b/packages/gateway/src/control-plane/token-usage/aggregate.ts
index 2ba67ff74..3b0009805 100644
--- a/packages/gateway/src/control-plane/token-usage/aggregate.ts
+++ b/packages/gateway/src/control-plane/token-usage/aggregate.ts
@@ -22,7 +22,10 @@ export interface DisplayUsageByUserRecord {
 
 // Cost is pure addition over the dimension rows: Σ tokens × unit_price / 1e6.
 // No subtraction is needed because the counts are disjoint and each dimension
-// already carries its own resolved unit price snapshot.
+// already carries its own resolved unit price snapshot. `record.cost` here
+// is rebuilt per row from the per-dimension `unit_price` columns, into which
+// the repo writer has already folded the bucket's tier — so the dimension
+// lookup is a direct hit and no tier resolution is needed at read time.
 const recordCostUsd = (record: UsageRecord): number => {
   let total = 0;
   for (const dimension of BILLING_DIMENSIONS) {
diff --git a/packages/gateway/src/control-plane/token-usage/aggregate_test.ts b/packages/gateway/src/control-plane/token-usage/aggregate_test.ts
index 04d62e46a..b16cbac66 100644
--- a/packages/gateway/src/control-plane/token-usage/aggregate_test.ts
+++ b/packages/gateway/src/control-plane/token-usage/aggregate_test.ts
@@ -14,6 +14,7 @@ const baseRecord = (overrides: Partial<UsageRecord>): UsageRecord => ({
   model: 'claude-opus-4-7',
   upstream: 'up_copilot',
   modelKey: 'claude-opus-4-7',
+  tier: null,
   requests: 1,
   tokens: { input: 100, output: 50 },
   cost: opus47Pricing,
@@ -83,3 +84,30 @@ test('aggregateUsageForDisplay charges image dimensions separately', () => {
   // 10 + 5 + 40 + 30 = $85.
   assertAlmostEquals(out[0].cost, 85, 1e-9);
 });
+
+test('aggregateUsageForDisplay reads unit prices from the already-folded cost the repo writer hands back', () => {
+  // The repo write path (`repo/sql.ts:dimensionRows`, `repo/memory.ts:dimensionEntries`)
+  // resolves the bucket's tier into per-dimension unit prices BEFORE storing,
+  // so by the time aggregate sees a UsageRecord the `cost` field is already
+  // the effective pricing for that bucket's tier and tier resolution is a
+  // no-op. The two records below (one `fast`, one base tier) model the post-write shape.
+  // Opus 4.8: standard $5 / $25, fast $10 / $50.
+  const fastRow = baseRecord({
+    tier: 'fast',
+    cost: { input: 10, output: 50 },
+    tokens: { input: 1_000_000, output: 1_000_000 },
+  });
+  const standardRow = baseRecord({
+    tier: null,
+    cost: { input: 5, output: 25 },
+    tokens: { input: 1_000_000, output: 1_000_000 },
+  });
+
+  const fastOut = aggregateUsageForDisplay([fastRow]);
+  // 1M * $10 + 1M * $50 = $60.
+  assertAlmostEquals(fastOut[0].cost, 60, 1e-9);
+
+  const standardOut = aggregateUsageForDisplay([standardRow]);
+  // 1M * $5 + 1M * $25 = $30.
+  assertAlmostEquals(standardOut[0].cost, 30, 1e-9);
+});
diff --git a/packages/gateway/src/control-plane/token-usage/routes_test.ts b/packages/gateway/src/control-plane/token-usage/routes_test.ts
index b911e2011..51c5ed85e 100644
--- a/packages/gateway/src/control-plane/token-usage/routes_test.ts
+++ b/packages/gateway/src/control-plane/token-usage/routes_test.ts
@@ -16,6 +16,7 @@ const seedUsage = async (
     upstream: 'up_test',
     modelKey: model,
     hour,
+    tier: null,
     requests,
     tokens: { input: 100, output: 50 },
     cost: null,
diff --git a/packages/gateway/src/data-plane/shared/telemetry/usage.ts b/packages/gateway/src/data-plane/shared/telemetry/usage.ts
index 99f232f47..7b32e8ad8 100644
--- a/packages/gateway/src/data-plane/shared/telemetry/usage.ts
+++ b/packages/gateway/src/data-plane/shared/telemetry/usage.ts
@@ -6,14 +6,27 @@ import type { TelemetryModelIdentity } from '@floway-dev/provider';
 
 export const hasTokenUsage = (usage: TokenUsage): boolean => BILLING_DIMENSIONS.some(dimension => (usage[dimension] ?? 0) > 0);
 
+// Map an upstream-reported service tier onto the tier marker the gateway
+// stores on the usage row. `default` and `auto` (OpenAI's response-side base
+// values) and `standard` (Anthropic's response-side base value) all denote
+// base pricing and collapse to null so they aggregate with rows that carry
+// no tier at all.
+// https://developers.openai.com/api/docs/guides/priority-processing
+// https://docs.claude.com/en/api/service-tiers
+// https://docs.claude.com/en/build-with-claude/fast-mode
+export const billableServiceTier = (tier: string | null | undefined): string | null =>
+  tier != null && tier !== 'default' && tier !== 'auto' && tier !== 'standard' ? tier : null;
+
 // Drop zero / undefined dimensions so a usage map only carries the dimensions
-// actually billed.
+// actually billed. `tier` (a non-numeric service-tier marker) survives the
+// filter so per-tier pricing overrides resolve at recording time.
 export const tokenUsage = (counts: TokenUsage): TokenUsage => {
   const out: TokenUsage = {};
   for (const dimension of BILLING_DIMENSIONS) {
     const value = counts[dimension] ?? 0;
     if (value > 0) out[dimension] = value;
   }
+  if (counts.tier != null) out.tier = counts.tier;
   return out;
 };
 
@@ -82,6 +95,7 @@ export const recordTokenUsage = async (keyId: string, modelIdentity: TelemetryMo
       upstream: modelIdentity.upstream,
       modelKey: modelIdentity.modelKey,
       hour: currentHour(),
+      tier: usage.tier ?? null,
       requests: 1,
       tokens: usage,
       cost: modelIdentity.cost,
diff --git a/packages/gateway/src/repo/memory.ts b/packages/gateway/src/repo/memory.ts
index 81bbd8e45..89088e84d 100644
--- a/packages/gateway/src/repo/memory.ts
+++ b/packages/gateway/src/repo/memory.ts
@@ -43,7 +43,7 @@ import { serializeStoredState } from './upstream-json.ts';
 import { latencyBucketForMs } from '../shared/performance-histogram.ts';
 import { generateSessionToken } from '../shared/session-tokens.ts';
 import { assertWebSearchProviderName } from '../shared/web-search-providers.ts';
-import { BILLING_DIMENSIONS, type BillingDimension, type ModelPricing, unitPriceForDimension } from '@floway-dev/protocols/common';
+import { BILLING_DIMENSIONS, type BillingDimension, type ModelPricing, resolveEffectivePricing, unitPriceForDimension } from '@floway-dev/protocols/common';
 import type { UpstreamModel, UpstreamRecord } from '@floway-dev/provider';
 
 const SEED_ADMIN_USER: User = {
@@ -230,6 +230,7 @@ interface UsageBucketIdentity {
   upstream: string | null;
   modelKey: string;
   hour: string;
+  tier: string | null;
 }
 
 interface UsageBucketState extends UsageBucketIdentity {
@@ -242,13 +243,14 @@ class MemoryUsageRepo implements UsageRepo {
   private store = new Map<string, UsageBucketState>();
 
   private key(r: UsageBucketIdentity): string {
-    return [r.keyId, r.model, r.upstream ?? '', r.modelKey, r.hour].join('\0');
+    return [r.keyId, r.model, r.upstream ?? '', r.modelKey, r.hour, r.tier ?? ''].join('\0');
   }
 
   private dimensionEntries(record: UsageRecord): { dimension: BillingDimension; tokens: number; unitPrice: number | null }[] {
+    const effective = resolveEffectivePricing(record.cost, record.tier);
     return BILLING_DIMENSIONS.flatMap(dimension => {
       const tokens = record.tokens[dimension] ?? 0;
-      return tokens > 0 ? [{ dimension, tokens, unitPrice: unitPriceForDimension(record.cost, dimension) }] : [];
+      return tokens > 0 ? [{ dimension, tokens, unitPrice: unitPriceForDimension(effective, dimension) }] : [];
     });
   }
 
@@ -261,14 +263,14 @@ class MemoryUsageRepo implements UsageRepo {
       const unitPrice = state.unitPrices[dimension];
       if (unitPrice !== undefined) (cost ??= {})[dimension] = unitPrice;
     }
-    return { keyId: state.keyId, model: state.model, upstream: state.upstream ?? null, modelKey: state.modelKey, hour: state.hour, requests: state.requests, tokens, cost };
+    return { keyId: state.keyId, model: state.model, upstream: state.upstream ?? null, modelKey: state.modelKey, hour: state.hour, tier: state.tier, requests: state.requests, tokens, cost };
   }
 
   private bucket(record: UsageRecord): UsageBucketState {
     const k = this.key(record);
     let state = this.store.get(k);
     if (!state) {
-      state = { keyId: record.keyId, model: record.model, upstream: record.upstream ?? null, modelKey: record.modelKey, hour: record.hour, tokens: {}, unitPrices: {}, requests: 0 };
+      state = { keyId: record.keyId, model: record.model, upstream: record.upstream ?? null, modelKey: record.modelKey, hour: record.hour, tier: record.tier, tokens: {}, unitPrices: {}, requests: 0 };
       this.store.set(k, state);
     }
     return state;
@@ -308,6 +310,7 @@ class MemoryUsageRepo implements UsageRepo {
       upstream: record.upstream ?? null,
       modelKey: record.modelKey,
       hour: record.hour,
+      tier: record.tier,
       tokens: {},
       unitPrices: {},
       requests: record.requests,
diff --git a/packages/gateway/src/repo/sql.ts b/packages/gateway/src/repo/sql.ts
index f14d31337..933ca121c 100644
--- a/packages/gateway/src/repo/sql.ts
+++ b/packages/gateway/src/repo/sql.ts
@@ -39,7 +39,7 @@ import { latencyBucketForMs } from '../shared/performance-histogram.ts';
 import { generateSessionToken } from '../shared/session-tokens.ts';
 import { assertWebSearchProviderName } from '../shared/web-search-providers.ts';
 import type { SqlDatabase, SqlPreparedStatement, SqlResult } from '@floway-dev/platform';
-import { BILLING_DIMENSIONS, type BillingDimension, type ModelPricing, unitPriceForDimension } from '@floway-dev/protocols/common';
+import { BILLING_DIMENSIONS, type BillingDimension, type ModelPricing, resolveEffectivePricing, unitPriceForDimension } from '@floway-dev/protocols/common';
 import type { ProxyFallbackEntry, UpstreamModel, UpstreamProviderKind, UpstreamRecord } from '@floway-dev/provider';
 
 const runStatements = async (db: SqlDatabase, statements: SqlPreparedStatement[]): Promise<SqlResult[]> => {
@@ -369,11 +369,13 @@ class SqlSessionsRepo implements SessionsRepo {
   }
 }
 
-const dimensionRows = (record: UsageRecord): { dimension: BillingDimension; tokens: number; unitPrice: number | null }[] =>
-  BILLING_DIMENSIONS.flatMap(dimension => {
+const dimensionRows = (record: UsageRecord): { dimension: BillingDimension; tokens: number; unitPrice: number | null }[] => {
+  const effective = resolveEffectivePricing(record.cost, record.tier);
+  return BILLING_DIMENSIONS.flatMap(dimension => {
     const tokens = record.tokens[dimension] ?? 0;
-    return tokens > 0 ? [{ dimension, tokens, unitPrice: unitPriceForDimension(record.cost, dimension) }] : [];
+    return tokens > 0 ? [{ dimension, tokens, unitPrice: unitPriceForDimension(effective, dimension) }] : [];
   });
+};
 
 class SqlUsageRepo implements UsageRepo {
   constructor(private db: SqlDatabase) {}
@@ -383,19 +385,19 @@ class SqlUsageRepo implements UsageRepo {
     const statements: SqlPreparedStatement[] = dimensionRows(record).map(row =>
       this.db
         .prepare(
-          `INSERT INTO usage (key_id, model, upstream, model_key, hour, dimension, tokens, unit_price) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+          `INSERT INTO usage (key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT DO UPDATE SET
              tokens = tokens + excluded.tokens,
              unit_price = COALESCE(unit_price, excluded.unit_price)`,
         )
-        .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, row.dimension, row.tokens, row.unitPrice));
+        .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier, row.dimension, row.tokens, row.unitPrice));
     statements.push(
       this.db
         .prepare(
-          `INSERT INTO usage_requests (key_id, model, upstream, model_key, hour, requests) VALUES (?, ?, ?, ?, ?, ?)
+          `INSERT INTO usage_requests (key_id, model, upstream, model_key, hour, tier, requests) VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT DO UPDATE SET requests = requests + excluded.requests`,
         )
-        .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.requests),
+        .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier, record.requests),
     );
     await runStatements(this.db, statements);
   }
@@ -405,11 +407,11 @@ class SqlUsageRepo implements UsageRepo {
     const binds = opts.keyId ? [opts.keyId, opts.start, opts.end] : [opts.start, opts.end];
     const [{ results: dimensions }, { results: requests }] = await Promise.all([
       this.db
-        .prepare(`SELECT key_id, model, upstream, model_key, hour, dimension, tokens, unit_price FROM usage WHERE ${dimensionWhere}`)
+        .prepare(`SELECT key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price FROM usage WHERE ${dimensionWhere}`)
         .bind(...binds)
         .all<UsageDimensionRow>(),
       this.db
-        .prepare(`SELECT key_id, model, upstream, model_key, hour, requests FROM usage_requests WHERE ${dimensionWhere}`)
+        .prepare(`SELECT key_id, model, upstream, model_key, hour, tier, requests FROM usage_requests WHERE ${dimensionWhere}`)
         .bind(...binds)
         .all<UsageRequestRow>(),
     ]);
@@ -418,8 +420,8 @@ class SqlUsageRepo implements UsageRepo {
 
   async listAll(): Promise<UsageRecord[]> {
     const [{ results: dimensions }, { results: requests }] = await Promise.all([
-      this.db.prepare('SELECT key_id, model, upstream, model_key, hour, dimension, tokens, unit_price FROM usage').all<UsageDimensionRow>(),
-      this.db.prepare('SELECT key_id, model, upstream, model_key, hour, requests FROM usage_requests').all<UsageRequestRow>(),
+      this.db.prepare('SELECT key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price FROM usage').all<UsageDimensionRow>(),
+      this.db.prepare('SELECT key_id, model, upstream, model_key, hour, tier, requests FROM usage_requests').all<UsageRequestRow>(),
     ]);
     return assembleUsageRecords(dimensions, requests);
   }
@@ -430,20 +432,20 @@ class SqlUsageRepo implements UsageRepo {
     // dimensions absent from the new record do not linger.
     const statements: SqlPreparedStatement[] = [
       this.db
-        .prepare("DELETE FROM usage WHERE key_id = ? AND model = ? AND COALESCE(upstream, '') = COALESCE(?, '') AND model_key = ? AND hour = ?")
-        .bind(record.keyId, record.model, upstream, record.modelKey, record.hour),
+        .prepare("DELETE FROM usage WHERE key_id = ? AND model = ? AND COALESCE(upstream, '') = COALESCE(?, '') AND model_key = ? AND hour = ? AND COALESCE(tier, '') = COALESCE(?, '')")
+        .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier),
       ...dimensionRows(record).map(row =>
         this.db
-          .prepare('INSERT INTO usage (key_id, model, upstream, model_key, hour, dimension, tokens, unit_price) VALUES (?, ?, ?, ?, ?, ?, ?, ?)')
-          .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, row.dimension, row.tokens, row.unitPrice)),
+          .prepare('INSERT INTO usage (key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)')
+          .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier, row.dimension, row.tokens, row.unitPrice)),
     ];
     statements.push(
       this.db
         .prepare(
-          `INSERT INTO usage_requests (key_id, model, upstream, model_key, hour, requests) VALUES (?, ?, ?, ?, ?, ?)
+          `INSERT INTO usage_requests (key_id, model, upstream, model_key, hour, tier, requests) VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT DO UPDATE SET requests = excluded.requests`,
         )
-        .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.requests),
+        .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier, record.requests),
     );
     await runStatements(this.db, statements);
   }
@@ -459,6 +461,7 @@ interface UsageDimensionRow {
   upstream: string | null;
   model_key: string;
   hour: string;
+  tier: string | null;
   dimension: string;
   tokens: number;
   unit_price: number | null;
@@ -470,11 +473,12 @@ interface UsageRequestRow {
   upstream: string | null;
   model_key: string;
   hour: string;
+  tier: string | null;
   requests: number;
 }
 
-const usageBucketKey = (row: { key_id: string; model: string; upstream: string | null; model_key: string; hour: string }): string =>
-  [row.key_id, row.model, row.upstream ?? '', row.model_key, row.hour].join('\0');
+const usageBucketKey = (row: { key_id: string; model: string; upstream: string | null; model_key: string; hour: string; tier: string | null }): string =>
+  [row.key_id, row.model, row.upstream ?? '', row.model_key, row.hour, row.tier ?? ''].join('\0');
 
 // Reassemble per-bucket UsageRecords from the two narrow tables. The dimension
 // rows carry the disjoint counts and the per-dimension unit_price snapshot,
@@ -483,11 +487,11 @@ const usageBucketKey = (row: { key_id: string; model: string; upstream: string |
 const assembleUsageRecords = (dimensions: readonly UsageDimensionRow[], requests: readonly UsageRequestRow[]): UsageRecord[] => {
   const byBucket = new Map<string, UsageRecord>();
 
-  const ensureRecord = (row: { key_id: string; model: string; upstream: string | null; model_key: string; hour: string }): UsageRecord => {
+  const ensureRecord = (row: { key_id: string; model: string; upstream: string | null; model_key: string; hour: string; tier: string | null }): UsageRecord => {
     const key = usageBucketKey(row);
     let record = byBucket.get(key);
     if (!record) {
-      record = { keyId: row.key_id, model: row.model, upstream: row.upstream, modelKey: row.model_key, hour: row.hour, requests: 0, tokens: {}, cost: null };
+      record = { keyId: row.key_id, model: row.model, upstream: row.upstream, modelKey: row.model_key, hour: row.hour, tier: row.tier, requests: 0, tokens: {}, cost: null };
       byBucket.set(key, record);
     }
     return record;
diff --git a/packages/gateway/src/repo/types.ts b/packages/gateway/src/repo/types.ts
index 6c002deba..5df3ef9dd 100644
--- a/packages/gateway/src/repo/types.ts
+++ b/packages/gateway/src/repo/types.ts
@@ -43,19 +43,34 @@ export interface UsageRecord {
   upstream: string | null;
   modelKey: string;
   hour: string;
+  // Service tier the upstream stamped on this bucket (Anthropic `speed`,
+  // OpenAI `service_tier`). null = the base / default tier. Distinct tiers
+  // for the same (keyId, model, upstream, modelKey, hour) are stored as
+  // separate buckets so per-tier pricing overrides apply correctly.
+  tier: string | null;
   requests: number;
-  // Disjoint per-dimension token counts for this bucket (see TokenUsage).
-  tokens: TokenUsage;
+  // Disjoint per-dimension token counts for this bucket. The tier the bucket
+  // was stamped under lives on the `tier` field above — do not encode it
+  // inside this map.
+  tokens: Partial<Record<BillingDimension, number>>;
   // Pricing snapshot taken at write time. null means the provider did not
   // resolve pricing for this model (Custom upstreams, unknown Copilot
   // public id, etc.). The repo derives per-dimension unit prices from it via
-  // unitPriceForDimension; aggregation treats a null snapshot as cost 0.
+  // unitPriceForDimension after `resolveEffectivePricing(cost, tier)` folds
+  // in the bucket's tier override; aggregation treats a null snapshot as
+  // cost 0.
   cost: ModelPricing | null;
 }
 
 // Disjoint per-dimension token counts. Absent keys mean zero for that
-// dimension. No key's count overlaps another's.
-export type TokenUsage = Partial<Record<BillingDimension, number>>;
+// dimension. No key's count overlaps another's. `tier` is the upstream-
+// reported service-tier marker (Anthropic `usage.speed`, OpenAI
+// `usage.service_tier`) that selects an override against `cost.tiers`
+// before any per-dimension unit-price lookup; absent / null = the model's
+// base pricing applies.
+export interface TokenUsage extends Partial<Record<BillingDimension, number>> {
+  tier?: string | null;
+}
 
 export type SearchUsageAction = 'search' | 'fetch_page';
 
@@ -137,10 +152,10 @@ export interface SessionsRepo {
 }
 
 export interface UsageRepo {
-  // Additive upsert: on (keyId, model, upstream, modelKey, hour) conflict,
-  // token counts are summed. cost is COALESCED — the first write within a
-  // bucket establishes the pricing snapshot for that row, later writes that
-  // share the bucket keep the original snapshot.
+  // Additive upsert: on (keyId, model, upstream, modelKey, hour, tier)
+  // conflict, token counts are summed. cost is COALESCED — the first write
+  // within a bucket establishes the pricing snapshot for that row, later
+  // writes that share the bucket keep the original snapshot.
   record(record: UsageRecord): Promise<void>;
   query(opts: { keyId?: string; start: string; end: string }): Promise<UsageRecord[]>;
   listAll(): Promise<UsageRecord[]>;
diff --git a/packages/protocols/src/common/models.ts b/packages/protocols/src/common/models.ts
index a26fe8f7b..896733ffd 100644
--- a/packages/protocols/src/common/models.ts
+++ b/packages/protocols/src/common/models.ts
@@ -23,7 +23,14 @@ export const BILLING_DIMENSIONS: readonly BillingDimension[] = ['input', 'input_
 // Keys are billing dimensions: bare `input`/`output` are the text/fallback rate
 // and `_image` keys are the image modality. Every key is optional; an absent key
 // falls back per `unitPriceForDimension` (modality → bare, cached → uncached).
-export type ModelPricing = Partial<Record<BillingDimension, number>>;
+//
+// `tiers` carries per-request service-tier overrides (Anthropic fast mode,
+// OpenAI priority/flex). Each tier key is the wire-value the upstream stamps
+// on the usage object (`fast`, `priority`, `flex`, ...). Resolve through
+// `resolveEffectivePricing(pricing, usage.tier)` before any unit-price lookup.
+export interface ModelPricing extends Partial<Record<BillingDimension, number>> {
+  tiers?: Record<string, Partial<Record<BillingDimension, number>>>;
+}
 
 // Resolve the USD-per-million-tokens unit price for one dimension against a
 // pricing snapshot, applying the LiteLLM-style fallback chain: a modality with
@@ -52,6 +59,21 @@ export const unitPriceForDimension = (pricing: ModelPricing | null, dimension: B
   }
 };
 
+// Fold the per-tier override (if any) into a flat ModelPricing snapshot, so
+// every downstream `unitPriceForDimension` call sees one self-contained map.
+// Per-dimension shallow merge: overlay keys win, omitted keys inherit the
+// base rate (and then flow through `unitPriceForDimension`'s fallback chain).
+// Returns a fresh object that never carries `tiers` — tiers nested inside a
+// tier would not match any real billing surface. An unknown or absent tier
+// yields the base rates unchanged (sans `tiers`), so old usage rows with no
+// tier keep pricing identically to before.
+export const resolveEffectivePricing = (pricing: ModelPricing | null, tier: string | null | undefined): ModelPricing | null => {
+  if (!pricing) return null;
+  const { tiers, ...base } = pricing;
+  const override = tier != null ? tiers?.[tier] : undefined;
+  return override ? { ...base, ...override } : base;
+};
+
 // High-level endpoint-family discriminator. A model belongs to exactly one
 // kind; cross-cutting features (vision, function calling, structured
 // outputs) are orthogonal and modeled separately when needed.
diff --git a/packages/protocols/src/common/models_test.ts b/packages/protocols/src/common/models_test.ts
index ffe10d8c3..e706d4814 100644
--- a/packages/protocols/src/common/models_test.ts
+++ b/packages/protocols/src/common/models_test.ts
@@ -1,6 +1,6 @@
 import { test } from 'vitest';
 
-import { unitPriceForDimension } from './models.ts';
+import { resolveEffectivePricing, unitPriceForDimension, type ModelPricing } from './models.ts';
 import { assertEquals } from '../test-assert.ts';
 
 test('unitPriceForDimension returns null when pricing snapshot is null', () => {
@@ -33,3 +33,57 @@ test('unitPriceForDimension returns null when the fallback chain is empty', () =
   assertEquals(unitPriceForDimension({}, 'input_cache_write_1h'), null);
   assertEquals(unitPriceForDimension({ output: 5 }, 'input_cache_write_1h'), null);
 });
+
+test('resolveEffectivePricing merges a tier override into the base snapshot and strips tiers', () => {
+  const base: ModelPricing = {
+    input: 5,
+    input_cache_read: 0.5,
+    input_cache_write: 6.25,
+    output: 25,
+    tiers: { fast: { input: 30, output: 150, input_cache_write: 60 } },
+  };
+  const effective = resolveEffectivePricing(base, 'fast');
+  assertEquals(effective, {
+    input: 30,
+    input_cache_read: 0.5,
+    input_cache_write: 60,
+    output: 150,
+  });
+});
+
+test('resolveEffectivePricing shallow-merges per dimension — omitted overlay keys inherit the base rate', () => {
+  // The codex flex/priority overlays exploit this: they declare only the
+  // input/output/cache-read dimensions that differ at the tier and leave
+  // cache-write (and any 1h/image dimension) to inherit base.
+  const base: ModelPricing = {
+    input: 5,
+    input_cache_read: 0.5,
+    input_cache_write: 6.25,
+    output: 25,
+    tiers: { flex: { input: 2.5 } },
+  };
+  assertEquals(resolveEffectivePricing(base, 'flex'), {
+    input: 2.5,
+    input_cache_read: 0.5,
+    input_cache_write: 6.25,
+    output: 25,
+  });
+});
+
+test('resolveEffectivePricing returns the base snapshot (sans tiers) when tier is unknown or absent', () => {
+  const base: ModelPricing = {
+    input: 5,
+    output: 25,
+    tiers: { fast: { input: 30 } },
+  };
+  const expected: ModelPricing = { input: 5, output: 25 };
+
+  assertEquals(resolveEffectivePricing(base, null), expected);
+  assertEquals(resolveEffectivePricing(base, undefined), expected);
+  assertEquals(resolveEffectivePricing(base, 'priority'), expected);
+});
+
+test('resolveEffectivePricing returns null when the base snapshot is null', () => {
+  assertEquals(resolveEffectivePricing(null, 'fast'), null);
+  assertEquals(resolveEffectivePricing(null, null), null);
+});

From 5e8aa8ce6177ea00c0409d62faef2d33eaf8f2cc Mon Sep 17 00:00:00 2001
From: Menci <mencici@msn.com>
Date: Sat, 20 Jun 2026 04:02:17 +0800
Subject: [PATCH 2/6] feat(gateway): parse usage.speed / usage.service_tier
 into TokenUsage.tier across protocol shapes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reads each upstream's service-tier marker off the usage object and stamps it
onto TokenUsage.tier so the recording layer routes the bucket through the
right tier override:

- Messages: Opus 4.6+ emits `usage.speed: 'standard' | 'fast'` alongside
  `usage.service_tier`. `speed` is consulted first, then `service_tier`;
  `standard` on either is left unset so base-tier rows aggregate with the
  historical no-tier rows, and any other value (e.g. `fast`, `priority`,
  `batch`) surfaces verbatim as the tier. Streamed deltas propagate `speed`
  so a late delta carries the tier all the way to message_stop.
- Responses: the top-level `response.service_tier` echoes the actual
  processing tier ('priority', 'flex', 'scale', 'default', 'auto'). We drop
  'default' and 'auto' — both denote base pricing — and surface anything
  else verbatim. The WebSocket path reads service_tier the same way as HTTP.
- Chat Completions: same as Responses but reading the top-level
  `chunk.service_tier` (chat.completion[.chunk]).

Protocol types grow `MessagesUsage.speed`, `ResponsesResult.service_tier`,
`ChatCompletionsResult.service_tier`, and `ChatCompletionsStreamEvent.service_tier`.
---
 .../llm/chat-completions/respond.ts           |  23 +-
 .../data-plane/llm/chat-completions/usage.ts  |  19 ++
 .../llm/chat-completions/usage_test.ts        |  76 +++++
 .../llm/messages/events/reassemble.ts         |   1 +
 .../src/data-plane/llm/messages/respond.ts    |  33 +-
 .../data-plane/llm/messages/respond_test.ts   | 283 ++++++++++++++++++
 .../src/data-plane/llm/responses/respond.ts   |  19 +-
 .../src/data-plane/llm/responses/usage.ts     |  21 ++
 .../data-plane/llm/responses/usage_test.ts    |  81 +++++
 .../src/data-plane/llm/responses/websocket.ts |  13 +-
 .../src/data-plane/shared/telemetry/usage.ts  |  23 +-
 .../protocols/src/chat-completions/index.ts   |   5 +-
 packages/protocols/src/messages/index.ts      |   7 +-
 packages/protocols/src/responses/index.ts     |   6 +-
 14 files changed, 547 insertions(+), 63 deletions(-)
 create mode 100644 packages/gateway/src/data-plane/llm/chat-completions/usage.ts
 create mode 100644 packages/gateway/src/data-plane/llm/chat-completions/usage_test.ts
 create mode 100644 packages/gateway/src/data-plane/llm/responses/usage.ts
 create mode 100644 packages/gateway/src/data-plane/llm/responses/usage_test.ts

diff --git a/packages/gateway/src/data-plane/llm/chat-completions/respond.ts b/packages/gateway/src/data-plane/llm/chat-completions/respond.ts
index 1d3f5b50b..906e6f13b 100644
--- a/packages/gateway/src/data-plane/llm/chat-completions/respond.ts
+++ b/packages/gateway/src/data-plane/llm/chat-completions/respond.ts
@@ -3,11 +3,11 @@ import { streamSSE } from 'hono/streaming';
 
 import { CHAT_COMPLETIONS_MISSING_TERMINAL_MESSAGE, collectChatCompletionsProtocolEventsToResult } from './events/to-result.ts';
 import { chatCompletionsProtocolFrameToSSEFrame } from './events/to-sse.ts';
-import { tokenUsage } from '../../shared/telemetry/usage.ts';
+import { tokenUsageFromChatCompletionsUsage } from './usage.ts';
 import type { GatewayCtx } from '../shared/gateway-ctx.ts';
 import { SourceStreamState, eventResultMetadata, forwardUpstreamHeaders, mergeForwardedUpstreamHeaders, plainResultToResponse, recordPerformance, recordUsage } from '../shared/respond.ts';
 import { type StreamCompletion, writeSSEFrames } from '../shared/stream/sse.ts';
-import type { ChatCompletionsStreamEvent, ChatCompletionsResult } from '@floway-dev/protocols/chat-completions';
+import type { ChatCompletionsStreamEvent } from '@floway-dev/protocols/chat-completions';
 import { chatCompletionsErrorPayloadMessage } from '@floway-dev/protocols/chat-completions';
 import { type ProtocolFrame, sseCommentFrame, sseFrame } from '@floway-dev/protocols/common';
 import { type ExecuteResult, type PlainResult, type InternalDebugError, toInternalDebugError } from '@floway-dev/provider';
@@ -44,7 +44,7 @@ export const respondChatCompletions = async (
     try {
       const response = await collectChatCompletionsProtocolEventsToResult(frames);
       const metadata = await eventResultMetadata(result);
-      const usage = response.usage ? tokenUsageFromChatCompletionsUsage(response.usage) : null;
+      const usage = response.usage ? tokenUsageFromChatCompletionsUsage(response.usage, response.service_tier) : null;
       await recordUsage(ctx, metadata.modelIdentity, usage);
       recordPerformance(ctx, metadata.performance, state.failed);
       return { success: true, response: Response.json(response, { headers: mergeForwardedUpstreamHeaders(undefined, result.headers) }) };
@@ -77,21 +77,6 @@ export const respondChatCompletions = async (
   return { success: true, response };
 };
 
-// --- token usage ---
-
-// OpenAI Chat usage reports prompt_tokens inclusive of cached and
-// cache-creation tokens; subtract them to recover the disjoint bare input.
-const tokenUsageFromChatCompletionsUsage = (u: NonNullable<ChatCompletionsResult['usage']>) => {
-  const cacheRead = u.prompt_tokens_details?.cached_tokens ?? 0;
-  const cacheWrite = u.prompt_tokens_details?.cache_creation_input_tokens ?? 0;
-  return tokenUsage({
-    input: u.prompt_tokens - cacheRead - cacheWrite,
-    input_cache_read: cacheRead,
-    input_cache_write: cacheWrite,
-    output: u.completion_tokens,
-  });
-};
-
 // --- error rendering ---
 
 const internalChatCompletionsErrorPayload = (error: InternalDebugError) => ({
@@ -119,7 +104,7 @@ const observeChatCompletionsFrames = async function* (frames: AsyncIterable<Prot
     const failed = isChatCompletionsFailureFrame(frame);
     if (failed) state.failed = true;
     if (observeUsage) {
-      state.rememberUsage(frame.type === 'event' && Array.isArray(frame.event.choices) && frame.event.choices.length === 0 && frame.event.usage ? tokenUsageFromChatCompletionsUsage(frame.event.usage) : null);
+      state.rememberUsage(frame.type === 'event' && Array.isArray(frame.event.choices) && frame.event.choices.length === 0 && frame.event.usage ? tokenUsageFromChatCompletionsUsage(frame.event.usage, frame.event.service_tier) : null);
     }
     if (isChatCompletionsTerminalFrame(frame) && !failed) state.completed = true;
     yield frame;
diff --git a/packages/gateway/src/data-plane/llm/chat-completions/usage.ts b/packages/gateway/src/data-plane/llm/chat-completions/usage.ts
new file mode 100644
index 000000000..8e655cab8
--- /dev/null
+++ b/packages/gateway/src/data-plane/llm/chat-completions/usage.ts
@@ -0,0 +1,19 @@
+import { billableServiceTier, tokenUsage } from '../../shared/telemetry/usage.ts';
+import type { ChatCompletionsResult } from '@floway-dev/protocols/chat-completions';
+
+// OpenAI Chat usage reports prompt_tokens inclusive of cached and
+// cache-creation tokens; subtract them to recover the disjoint bare input.
+// `serviceTier` is the top-level `service_tier` of the result or chunk that
+// carried `u`; `billableServiceTier` maps it onto `tier` so per-tier overrides
+// resolve at recording time. https://developers.openai.com/api/docs/guides/priority-processing
+export const tokenUsageFromChatCompletionsUsage = (u: NonNullable<ChatCompletionsResult['usage']>, serviceTier: string | null | undefined) => {
+  const cacheRead = u.prompt_tokens_details?.cached_tokens ?? 0;
+  const cacheWrite = u.prompt_tokens_details?.cache_creation_input_tokens ?? 0;
+  return tokenUsage({
+    input: u.prompt_tokens - cacheRead - cacheWrite,
+    input_cache_read: cacheRead,
+    input_cache_write: cacheWrite,
+    output: u.completion_tokens,
+    tier: billableServiceTier(serviceTier),
+  });
+};
diff --git a/packages/gateway/src/data-plane/llm/chat-completions/usage_test.ts b/packages/gateway/src/data-plane/llm/chat-completions/usage_test.ts
new file mode 100644
index 000000000..3f84e089e
--- /dev/null
+++ b/packages/gateway/src/data-plane/llm/chat-completions/usage_test.ts
@@ -0,0 +1,76 @@
+import { test } from 'vitest';
+
+import { tokenUsageFromChatCompletionsUsage } from './usage.ts';
+import { assertEquals } from '@floway-dev/test-utils';
+
+test('Chat usage maps disjoint input/cache/output counts and omits tier when service_tier is absent', () => {
+  assertEquals(
+    tokenUsageFromChatCompletionsUsage(
+      { prompt_tokens: 100, completion_tokens: 20, total_tokens: 120, prompt_tokens_details: { cached_tokens: 30 } },
+      null,
+    ),
+    {
+      input: 70,
+      input_cache_read: 30,
+      output: 20,
+    },
+  );
+});
+
+test('Chat usage drops service_tier=default to no-tier', () => {
+  assertEquals(
+    tokenUsageFromChatCompletionsUsage(
+      { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 },
+      'default',
+    ),
+    {
+      input: 10,
+      output: 2,
+    },
+  );
+});
+
+test('Chat usage forwards service_tier=priority verbatim', () => {
+  assertEquals(
+    tokenUsageFromChatCompletionsUsage(
+      { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 },
+      'priority',
+    ),
+    {
+      input: 10,
+      output: 2,
+      tier: 'priority',
+    },
+  );
+});
+
+test('Chat usage forwards service_tier=flex verbatim', () => {
+  assertEquals(
+    tokenUsageFromChatCompletionsUsage(
+      { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 },
+      'flex',
+    ),
+    {
+      input: 10,
+      output: 2,
+      tier: 'flex',
+    },
+  );
+});
+
+test('Chat usage forwards an unknown tier verbatim (forward-compat with a future wire value)', () => {
+  // A future OpenAI value the SDK has not minted yet must reach the billing
+  // record so the operator can backfill a per-tier pricing override for it
+  // rather than have it silently fold into the base bucket.
+  assertEquals(
+    tokenUsageFromChatCompletionsUsage(
+      { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 },
+      'super-priority',
+    ),
+    {
+      input: 10,
+      output: 2,
+      tier: 'super-priority',
+    },
+  );
+});
diff --git a/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts b/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts
index 71cea87e6..55ee37415 100644
--- a/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts
+++ b/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts
@@ -102,6 +102,7 @@ const applyMessagesUsage = (usage: MessagesUsage, update: Partial<MessagesUsage>
   }
   if (update.cache_creation != null) usage.cache_creation = update.cache_creation;
   if (update.service_tier != null) usage.service_tier = update.service_tier;
+  if (update.speed != null) usage.speed = update.speed;
   if (update.server_tool_use != null) {
     usage.server_tool_use = update.server_tool_use;
   }
diff --git a/packages/gateway/src/data-plane/llm/messages/respond.ts b/packages/gateway/src/data-plane/llm/messages/respond.ts
index 3d65dbc3f..ef8f61a2e 100644
--- a/packages/gateway/src/data-plane/llm/messages/respond.ts
+++ b/packages/gateway/src/data-plane/llm/messages/respond.ts
@@ -3,7 +3,7 @@ import { streamSSE } from 'hono/streaming';
 
 import { MESSAGES_MISSING_TERMINAL_MESSAGE, collectMessagesProtocolEventsToResult } from './events/to-result.ts';
 import { messagesProtocolFrameToSSEFrame } from './events/to-sse.ts';
-import { tokenUsage } from '../../shared/telemetry/usage.ts';
+import { billableServiceTier, tokenUsage } from '../../shared/telemetry/usage.ts';
 import type { GatewayCtx } from '../shared/gateway-ctx.ts';
 import { SourceStreamState, eventResultMetadata, forwardUpstreamHeaders, mergeForwardedUpstreamHeaders, plainResultToResponse, recordPerformance, recordUsage } from '../shared/respond.ts';
 import { type StreamCompletion, writeSSEFrames } from '../shared/stream/sse.ts';
@@ -83,16 +83,29 @@ export const respondMessages = async (
 // (extended-cache-ttl-2025-04-11), split the per-TTL counts onto the 5m and
 // 1h dimensions; the flat `cache_creation_input_tokens` is the sum and is
 // only consulted when the sub-object is absent.
+//
+// Response usage carries two server-stamped tier fields: `speed` (fast mode)
+// and `service_tier` (capacity assignment). Fast mode is documented as
+// unavailable with Priority Tier and the Batch API, so at most one
+// non-`standard` value lands on a single response — prefer `speed` first
+// (the only multi-x override today) then fall through to `service_tier`.
+// `standard` on either side collapses to null so standard-tier rows aggregate
+// with base; unknown values flow through verbatim so a future Anthropic
+// release does not silently bill at base.
+//   * https://docs.claude.com/en/build-with-claude/fast-mode
+//   * https://docs.claude.com/en/api/service-tiers
 const tokenUsageFromMessagesUsage = (u: MessagesUsageLike) => {
   const cacheWrite5m = u.cache_creation?.ephemeral_5m_input_tokens;
   const cacheWrite1h = u.cache_creation?.ephemeral_1h_input_tokens;
   const cacheWriteRolledUp = u.cache_creation_input_tokens ?? 0;
+  const tier = billableServiceTier(u.speed) ?? billableServiceTier(u.service_tier);
   return tokenUsage({
     input: u.input_tokens ?? 0,
     input_cache_read: u.cache_read_input_tokens ?? 0,
     input_cache_write: cacheWrite5m ?? cacheWriteRolledUp,
     input_cache_write_1h: cacheWrite1h ?? 0,
     output: u.output_tokens,
+    tier,
   });
 };
 
@@ -122,9 +135,23 @@ export const tokenUsageFromMessagesFrame = (frame: ProtocolFrame<MessagesStreamE
     return { ...state.current };
   }
   if (event.type === 'message_delta' && event.usage) {
+    // Anthropic's wire schema lets a delta re-stamp `speed`/`service_tier`,
+    // and both fields are per-message properties of this billing bucket. A
+    // delta-supplied tier therefore wins; absent that, message_start's tier
+    // carries forward across the bucket. Two branches below: the cache-hit
+    // prompt path (message_start carried zero input, this delta now carries
+    // the real input accounting) rebuilds state.current from the delta and
+    // backfills tier from the prior; the normal path updates the running
+    // output and restamps tier when the delta provides one.
+    const deltaResolved = tokenUsageFromMessagesUsage(event.usage);
     if (!state.gotInputFromStart && event.usage.input_tokens !== undefined) {
-      state.current = tokenUsageFromMessagesUsage(event.usage);
-    } else state.current.output = event.usage.output_tokens;
+      const priorTier = state.current.tier;
+      state.current = deltaResolved;
+      state.current.tier ??= priorTier;
+    } else {
+      state.current.output = event.usage.output_tokens;
+      if (deltaResolved.tier != null) state.current.tier = deltaResolved.tier;
+    }
     return { ...state.current };
   }
   return event.type === 'message_stop' ? { ...state.current } : null;
diff --git a/packages/gateway/src/data-plane/llm/messages/respond_test.ts b/packages/gateway/src/data-plane/llm/messages/respond_test.ts
index 7308ab521..c9fa3277e 100644
--- a/packages/gateway/src/data-plane/llm/messages/respond_test.ts
+++ b/packages/gateway/src/data-plane/llm/messages/respond_test.ts
@@ -221,6 +221,289 @@ test('Messages stream usage falls back to the rolled-up cache_creation when the
   });
 });
 
+test('Messages stream usage captures speed=fast as tier=fast', () => {
+  const state = createMessagesStreamUsageState();
+
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_start',
+      message: {
+        id: 'msg_1',
+        type: 'message',
+        role: 'assistant',
+        content: [],
+        model: 'claude-opus-4-8',
+        stop_reason: null,
+        stop_sequence: null,
+        usage: { input_tokens: 5, output_tokens: 0, speed: 'fast' },
+      },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+
+  assertEquals(tokenUsageFromMessagesFrame(stop(), state), {
+    input: 5,
+    tier: 'fast',
+  });
+});
+
+test('Messages stream usage leaves tier unset when speed is standard', () => {
+  const state = createMessagesStreamUsageState();
+
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_start',
+      message: {
+        id: 'msg_1',
+        type: 'message',
+        role: 'assistant',
+        content: [],
+        model: 'claude-opus-4-8',
+        stop_reason: null,
+        stop_sequence: null,
+        usage: { input_tokens: 5, output_tokens: 0, speed: 'standard' },
+      },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+
+  assertEquals(tokenUsageFromMessagesFrame(stop(), state), {
+    input: 5,
+  });
+});
+
+test('Messages stream usage forwards service_tier=priority verbatim', () => {
+  const state = createMessagesStreamUsageState();
+
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_start',
+      message: {
+        id: 'msg_1',
+        type: 'message',
+        role: 'assistant',
+        content: [],
+        model: 'claude-sonnet-4-6',
+        stop_reason: null,
+        stop_sequence: null,
+        usage: { input_tokens: 5, output_tokens: 0, service_tier: 'priority' },
+      },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+
+  assertEquals(tokenUsageFromMessagesFrame(stop(), state), {
+    input: 5,
+    tier: 'priority',
+  });
+});
+
+test('Messages stream usage forwards service_tier=batch verbatim', () => {
+  const state = createMessagesStreamUsageState();
+
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_start',
+      message: {
+        id: 'msg_1',
+        type: 'message',
+        role: 'assistant',
+        content: [],
+        model: 'claude-sonnet-4-6',
+        stop_reason: null,
+        stop_sequence: null,
+        usage: { input_tokens: 5, output_tokens: 0, service_tier: 'batch' },
+      },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+
+  assertEquals(tokenUsageFromMessagesFrame(stop(), state), {
+    input: 5,
+    tier: 'batch',
+  });
+});
+
+test('Messages stream usage forwards an unknown non-standard tier verbatim (forward-compat)', () => {
+  // A future Anthropic value the SDK has not minted yet must reach the
+  // billing record so the operator can backfill a pricing override for it
+  // rather than have it silently fold into the base bucket.
+  const state = createMessagesStreamUsageState();
+
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_start',
+      message: {
+        id: 'msg_1',
+        type: 'message',
+        role: 'assistant',
+        content: [],
+        model: 'claude-opus-4-8',
+        stop_reason: null,
+        stop_sequence: null,
+        usage: { input_tokens: 5, output_tokens: 0, speed: 'turbo' },
+      },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+
+  assertEquals(tokenUsageFromMessagesFrame(stop(), state), {
+    input: 5,
+    tier: 'turbo',
+  });
+});
+
+test('Messages stream usage prefers speed=fast over service_tier=standard', () => {
+  // Anthropic stamps both fields on a Priority-Tier-aware account; fast mode
+  // is mutually exclusive with priority/batch per docs, so a `fast` row will
+  // always pair with `service_tier: 'standard'`. The non-standard signal
+  // wins; the redundant 'standard' must not clobber it.
+  const state = createMessagesStreamUsageState();
+
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_start',
+      message: {
+        id: 'msg_1',
+        type: 'message',
+        role: 'assistant',
+        content: [],
+        model: 'claude-opus-4-8',
+        stop_reason: null,
+        stop_sequence: null,
+        usage: { input_tokens: 5, output_tokens: 0, speed: 'fast', service_tier: 'standard' },
+      },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+
+  assertEquals(tokenUsageFromMessagesFrame(stop(), state), {
+    input: 5,
+    tier: 'fast',
+  });
+});
+
+test('Messages stream usage carries tier forward when a fully cache-hit start is followed by a delta that re-supplies input', () => {
+  // A fully cache-hit prompt: message_start reports bare input 0 and tier 'fast',
+  // and a later delta carries input_tokens without re-stamping the tier fields.
+  // The delta replaces state.current (gotInputFromStart was false), so without
+  // explicit carry-forward the fast tier would be dropped — and the row would
+  // bill at base.
+  const state = createMessagesStreamUsageState();
+
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_start',
+      message: {
+        id: 'msg_1',
+        type: 'message',
+        role: 'assistant',
+        content: [],
+        model: 'claude-opus-4-8',
+        stop_reason: null,
+        stop_sequence: null,
+        usage: { input_tokens: 0, output_tokens: 0, speed: 'fast' },
+      },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_delta',
+      delta: {},
+      usage: { input_tokens: 11, output_tokens: 2, cache_read_input_tokens: 5 },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+
+  assertEquals(tokenUsageFromMessagesFrame(stop(), state), {
+    input: 11,
+    input_cache_read: 5,
+    output: 2,
+    tier: 'fast',
+  });
+});
+
+test('Messages stream usage lets a delta-stamped tier win over message_start on the cache-hit-prompt path', () => {
+  // The wire schema permits message_delta.usage to carry service_tier/speed
+  // (packages/protocols/src/messages/index.ts). If a future upstream reassigns
+  // the served tier between message_start and message_delta — or starts
+  // stamping the served tier only on the delta — the delta value describes
+  // the billing bucket and must replace the start-stamped one.
+  const state = createMessagesStreamUsageState();
+
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_start',
+      message: {
+        id: 'msg_1',
+        type: 'message',
+        role: 'assistant',
+        content: [],
+        model: 'claude-opus-4-8',
+        stop_reason: null,
+        stop_sequence: null,
+        usage: { input_tokens: 0, output_tokens: 0, speed: 'fast' },
+      },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_delta',
+      delta: {},
+      usage: { input_tokens: 11, output_tokens: 2, service_tier: 'priority' },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+
+  assertEquals(tokenUsageFromMessagesFrame(stop(), state), {
+    input: 11,
+    output: 2,
+    tier: 'priority',
+  });
+});
+
+test('Messages stream usage lets a delta-stamped tier win on the normal output-only path', () => {
+  // Symmetric to the cache-hit branch: when message_start already carried the
+  // real input accounting (gotInputFromStart === true), the delta normally
+  // just updates the running output. The wire schema still permits the delta
+  // to (re)stamp service_tier/speed, and that signal describes this billing
+  // bucket — must replace what start stamped, not be silently dropped.
+  const state = createMessagesStreamUsageState();
+
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_start',
+      message: {
+        id: 'msg_1',
+        type: 'message',
+        role: 'assistant',
+        content: [],
+        model: 'claude-opus-4-8',
+        stop_reason: null,
+        stop_sequence: null,
+        usage: { input_tokens: 50, output_tokens: 0, service_tier: 'standard' },
+      },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+  tokenUsageFromMessagesFrame(
+    eventFrame({
+      type: 'message_delta',
+      delta: {},
+      usage: { output_tokens: 7, service_tier: 'priority' },
+    } satisfies MessagesStreamEvent),
+    state,
+  );
+
+  assertEquals(tokenUsageFromMessagesFrame(stop(), state), {
+    input: 50,
+    output: 7,
+    tier: 'priority',
+  });
+});
+
 // --- header forwarding ---
 
 const forwardedHeadersFixture = (): Headers => new Headers({
diff --git a/packages/gateway/src/data-plane/llm/responses/respond.ts b/packages/gateway/src/data-plane/llm/responses/respond.ts
index e50712060..f40db6ac6 100644
--- a/packages/gateway/src/data-plane/llm/responses/respond.ts
+++ b/packages/gateway/src/data-plane/llm/responses/respond.ts
@@ -3,12 +3,12 @@ import { streamSSE } from 'hono/streaming';
 
 import { RESPONSES_MISSING_TERMINAL_MESSAGE, collectResponsesProtocolEventsToResult } from './events/to-result.ts';
 import { responsesProtocolFrameToSSEFrame } from './events/to-sse.ts';
-import { tokenUsage } from '../../shared/telemetry/usage.ts';
+import { tokenUsageFromResponsesResult } from './usage.ts';
 import type { GatewayCtx } from '../shared/gateway-ctx.ts';
 import { SourceStreamState, eventResultMetadata, forwardUpstreamHeaders, mergeForwardedUpstreamHeaders, plainResultToResponse, recordPerformance, recordUsage } from '../shared/respond.ts';
 import { type StreamCompletion, writeSSEFrames } from '../shared/stream/sse.ts';
 import { type ProtocolFrame, sseCommentFrame, sseFrame } from '@floway-dev/protocols/common';
-import { isResponsesTerminalEvent, type ResponsesResult, type ResponsesStreamEvent, responsesResultFromStreamEvent } from '@floway-dev/protocols/responses';
+import { isResponsesTerminalEvent, type ResponsesStreamEvent, responsesResultFromStreamEvent } from '@floway-dev/protocols/responses';
 import { type ExecuteResult, type PlainResult, type InternalDebugError, toInternalDebugError } from '@floway-dev/provider';
 import { upstreamErrorToResponse } from '@floway-dev/provider';
 
@@ -74,21 +74,6 @@ export const respondResponses = async (
   return { success: true, response };
 };
 
-// --- token usage ---
-
-// OpenAI Responses reports input_tokens inclusive of cached tokens; subtract
-// the cached split to recover the disjoint bare input.
-const tokenUsageFromResponsesResult = (r: ResponsesResult) => {
-  const u = r.usage;
-  if (!u) return null;
-  const cacheRead = u.input_tokens_details?.cached_tokens ?? 0;
-  return tokenUsage({
-    input: u.input_tokens - cacheRead,
-    input_cache_read: cacheRead,
-    output: u.output_tokens,
-  });
-};
-
 // --- error rendering ---
 
 const internalResponsesErrorResponse = (status: number, error: InternalDebugError): Response =>
diff --git a/packages/gateway/src/data-plane/llm/responses/usage.ts b/packages/gateway/src/data-plane/llm/responses/usage.ts
new file mode 100644
index 000000000..91960da58
--- /dev/null
+++ b/packages/gateway/src/data-plane/llm/responses/usage.ts
@@ -0,0 +1,21 @@
+import { billableServiceTier, tokenUsage } from '../../shared/telemetry/usage.ts';
+import type { ResponsesResult } from '@floway-dev/protocols/responses';
+
+// OpenAI Responses reports input_tokens inclusive of cached tokens; subtract
+// the cached split to recover the disjoint bare input. Returns null when the
+// response carries no `usage` object. The top-level `service_tier` echoes the
+// processing tier the upstream actually served the request at (e.g. `default`
+// when capacity downgraded a `priority` request); it is routed through
+// `billableServiceTier` so per-tier pricing overrides resolve at recording time.
+// https://developers.openai.com/api/docs/guides/priority-processing
+export const tokenUsageFromResponsesResult = (response: ResponsesResult) => {
+  const usage = response.usage;
+  if (!usage) return null;
+  const cacheRead = usage.input_tokens_details?.cached_tokens ?? 0;
+  return tokenUsage({
+    input: usage.input_tokens - cacheRead,
+    input_cache_read: cacheRead,
+    output: usage.output_tokens,
+    tier: billableServiceTier(response.service_tier),
+  });
+};
diff --git a/packages/gateway/src/data-plane/llm/responses/usage_test.ts b/packages/gateway/src/data-plane/llm/responses/usage_test.ts
new file mode 100644
index 000000000..cdf846c9a
--- /dev/null
+++ b/packages/gateway/src/data-plane/llm/responses/usage_test.ts
@@ -0,0 +1,81 @@
+import { test } from 'vitest';
+
+import { tokenUsageFromResponsesResult } from './usage.ts';
+import type { ResponsesResult } from '@floway-dev/protocols/responses';
+import { assertEquals } from '@floway-dev/test-utils';
+
+// Bare minimum ResponsesResult to exercise the usage extractor. The mapper
+// only touches `usage` and `service_tier`; the rest of the response shape is
+// irrelevant to billing.
+const minimalResult = (overrides: Partial<ResponsesResult>): ResponsesResult => ({
+  id: 'resp_1',
+  object: 'response',
+  model: 'gpt-test',
+  output: [],
+  status: 'completed',
+  incomplete_details: null,
+  error: null,
+  ...overrides,
+});
+
+test('Responses usage maps disjoint input/cache/output counts and omits tier when service_tier is absent', () => {
+  const result = minimalResult({
+    usage: { input_tokens: 100, output_tokens: 20, total_tokens: 120, input_tokens_details: { cached_tokens: 30 } },
+  });
+  assertEquals(tokenUsageFromResponsesResult(result), {
+    input: 70,
+    input_cache_read: 30,
+    output: 20,
+  });
+});
+
+test('Responses usage drops service_tier=default (OpenAI base value) to no-tier', () => {
+  const result = minimalResult({
+    usage: { input_tokens: 10, output_tokens: 2, total_tokens: 12 },
+    service_tier: 'default',
+  });
+  assertEquals(tokenUsageFromResponsesResult(result), {
+    input: 10,
+    output: 2,
+  });
+});
+
+test('Responses usage forwards service_tier=priority verbatim', () => {
+  const result = minimalResult({
+    usage: { input_tokens: 10, output_tokens: 2, total_tokens: 12 },
+    service_tier: 'priority',
+  });
+  assertEquals(tokenUsageFromResponsesResult(result), {
+    input: 10,
+    output: 2,
+    tier: 'priority',
+  });
+});
+
+test('Responses usage forwards service_tier=flex verbatim', () => {
+  const result = minimalResult({
+    usage: { input_tokens: 10, output_tokens: 2, total_tokens: 12 },
+    service_tier: 'flex',
+  });
+  assertEquals(tokenUsageFromResponsesResult(result), {
+    input: 10,
+    output: 2,
+    tier: 'flex',
+  });
+});
+
+test('Responses usage forwards an unknown tier verbatim (forward-compat with a future wire value)', () => {
+  const result = minimalResult({
+    usage: { input_tokens: 10, output_tokens: 2, total_tokens: 12 },
+    service_tier: 'batch',
+  });
+  assertEquals(tokenUsageFromResponsesResult(result), {
+    input: 10,
+    output: 2,
+    tier: 'batch',
+  });
+});
+
+test('Responses usage returns null when the upstream omits the usage object', () => {
+  assertEquals(tokenUsageFromResponsesResult(minimalResult({})), null);
+});
diff --git a/packages/gateway/src/data-plane/llm/responses/websocket.ts b/packages/gateway/src/data-plane/llm/responses/websocket.ts
index dbbd85579..1aadc245a 100644
--- a/packages/gateway/src/data-plane/llm/responses/websocket.ts
+++ b/packages/gateway/src/data-plane/llm/responses/websocket.ts
@@ -4,8 +4,8 @@ import { RESPONSES_MISSING_TERMINAL_MESSAGE } from './events/to-result.ts';
 import { createResponsesWsSession } from './items/store.ts';
 import { PreviousResponseNotFoundError } from './serve-prep.ts';
 import { responsesServe } from './serve.ts';
+import { tokenUsageFromResponsesResult } from './usage.ts';
 import { inboundHeadersForUpstream } from '../../shared/inbound-headers.ts';
-import { tokenUsage } from '../../shared/telemetry/usage.ts';
 import { createGatewayCtxForWs, type GatewayCtx } from '../shared/gateway-ctx.ts';
 import { SourceStreamState, eventResultMetadata, recordPerformance, recordUsage } from '../shared/respond.ts';
 import { DOWNSTREAM_KEEP_ALIVE_INTERVAL_MS, type StreamCompletion } from '../shared/stream/sse.ts';
@@ -399,17 +399,6 @@ const serverErrorEnvelope = (error: unknown): Record<string, unknown> => ({
   code: 'internal_error',
 });
 
-const tokenUsageFromResponsesResult = (response: ResponsesResult) => {
-  const usage = response.usage;
-  if (!usage) return null;
-  const cacheRead = usage.input_tokens_details?.cached_tokens ?? 0;
-  return tokenUsage({
-    input: usage.input_tokens - cacheRead,
-    input_cache_read: cacheRead,
-    output: usage.output_tokens,
-  });
-};
-
 const responseDoneSummary = (event: unknown) => {
   if (!event || typeof event !== 'object') return null;
   const type = (event as { type?: unknown }).type;
diff --git a/packages/gateway/src/data-plane/shared/telemetry/usage.ts b/packages/gateway/src/data-plane/shared/telemetry/usage.ts
index 7b32e8ad8..8d1aa89ee 100644
--- a/packages/gateway/src/data-plane/shared/telemetry/usage.ts
+++ b/packages/gateway/src/data-plane/shared/telemetry/usage.ts
@@ -7,15 +7,21 @@ import type { TelemetryModelIdentity } from '@floway-dev/provider';
 export const hasTokenUsage = (usage: TokenUsage): boolean => BILLING_DIMENSIONS.some(dimension => (usage[dimension] ?? 0) > 0);
 
 // Map an upstream-reported service tier onto the tier marker the gateway
-// stores on the usage row. `default` and `auto` (OpenAI's response-side base
-// values) and `standard` (Anthropic's response-side base value) all denote
-// base pricing and collapse to null so they aggregate with rows that carry
-// no tier at all.
+// stores on the usage row. `default` (OpenAI's response-side base value) and
+// `standard` (Anthropic's response-side base value) both denote base pricing
+// and collapse to null so they aggregate with rows that carry no tier at all.
+// Compared case-insensitively in case a future upstream stamps `'Default'`
+// or `'STANDARD'` (defensive — both protocols' SDKs ship the values in
+// lowercase today); non-base values pass through with their original
+// casing so per-tier overrides match the wire-stamped string verbatim.
 // https://developers.openai.com/api/docs/guides/priority-processing
 // https://docs.claude.com/en/api/service-tiers
 // https://docs.claude.com/en/build-with-claude/fast-mode
-export const billableServiceTier = (tier: string | null | undefined): string | null =>
-  tier != null && tier !== 'default' && tier !== 'auto' && tier !== 'standard' ? tier : null;
+export const billableServiceTier = (tier: string | null | undefined): string | null => {
+  if (tier == null) return null;
+  const normalized = tier.toLowerCase();
+  return normalized === 'default' || normalized === 'standard' ? null : tier;
+};
 
 // Drop zero / undefined dimensions so a usage map only carries the dimensions
 // actually billed. `tier` (a non-numeric service-tier marker) survives the
@@ -88,6 +94,7 @@ const splitModalityCounts = (
 };
 
 export const recordTokenUsage = async (keyId: string, modelIdentity: TelemetryModelIdentity, usage: TokenUsage): Promise<void> => {
+  const { tier, ...tokens } = usage;
   await Promise.all([
     getRepo().usage.record({
       keyId,
@@ -95,9 +102,9 @@ export const recordTokenUsage = async (keyId: string, modelIdentity: TelemetryMo
       upstream: modelIdentity.upstream,
       modelKey: modelIdentity.modelKey,
       hour: currentHour(),
-      tier: usage.tier ?? null,
+      tier: tier ?? null,
       requests: 1,
-      tokens: usage,
+      tokens,
       cost: modelIdentity.cost,
     }),
     (async () => {
diff --git a/packages/protocols/src/chat-completions/index.ts b/packages/protocols/src/chat-completions/index.ts
index fabe3f32e..394bd471c 100644
--- a/packages/protocols/src/chat-completions/index.ts
+++ b/packages/protocols/src/chat-completions/index.ts
@@ -20,7 +20,7 @@ export interface ChatCompletionsPayload {
   reasoning_effort?: string | null;
   prompt_cache_key?: string | null;
   safety_identifier?: string | null;
-  service_tier?: string | null;
+  service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null;
   tools?: ChatCompletionsTool[] | null;
   tool_choice?: 'none' | 'auto' | 'required' | { type: 'function'; function: { name: string } } | null;
   /** Request usage stats in streaming responses */
@@ -82,6 +82,8 @@ export interface ChatCompletionsResult {
   created: number;
   model: string;
   choices: ChatCompletionsChoiceNonStreaming[];
+  // https://platform.openai.com/docs/api-reference/chat/object
+  service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null;
   usage?: ChatCompletionsUsage;
 }
 
@@ -91,6 +93,7 @@ export interface ChatCompletionsStreamEvent {
   created: number;
   model: string;
   choices: ChatCompletionsChoiceStreaming[];
+  service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null;
   usage?: ChatCompletionsUsage;
 }
 
diff --git a/packages/protocols/src/messages/index.ts b/packages/protocols/src/messages/index.ts
index 10076c1a8..089b34d29 100644
--- a/packages/protocols/src/messages/index.ts
+++ b/packages/protocols/src/messages/index.ts
@@ -233,7 +233,10 @@ export interface MessagesUsage {
     ephemeral_5m_input_tokens?: number;
     ephemeral_1h_input_tokens?: number;
   };
-  service_tier?: 'standard' | 'priority' | 'batch';
+  // https://docs.claude.com/en/api/service-tiers
+  service_tier?: 'standard' | 'priority' | 'batch' | (string & {});
+  // https://docs.claude.com/en/build-with-claude/fast-mode
+  speed?: 'standard' | 'fast' | (string & {});
   server_tool_use?: MessagesUsageServerToolUse;
 }
 
@@ -312,6 +315,8 @@ export interface MessagesMessageDeltaEvent {
       ephemeral_5m_input_tokens?: number;
       ephemeral_1h_input_tokens?: number;
     };
+    service_tier?: 'standard' | 'priority' | 'batch' | (string & {});
+    speed?: 'standard' | 'fast' | (string & {});
     server_tool_use?: MessagesUsageServerToolUse;
   };
 }
diff --git a/packages/protocols/src/responses/index.ts b/packages/protocols/src/responses/index.ts
index 907bd0e42..a1aed6518 100644
--- a/packages/protocols/src/responses/index.ts
+++ b/packages/protocols/src/responses/index.ts
@@ -32,7 +32,7 @@ export interface ResponsesPayload {
   text?: { format?: Record<string, unknown> | null } | null;
   prompt_cache_key?: string | null;
   safety_identifier?: string | null;
-  service_tier?: string | null;
+  service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null;
 }
 
 // Narrower payload for `/responses/compact`. The official endpoint accepts a
@@ -50,7 +50,7 @@ export interface ResponsesCompactPayload {
   previous_response_id?: string | null;
   prompt_cache_key?: string | null;
   prompt_cache_retention?: 'in_memory' | '24h' | null;
-  service_tier?: string | null;
+  service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null;
   // Gateway-only: controls whether the compact response's output items + the
   // committed snapshot persist. Forwarded NEITHER to upstream nor to the
   // provider call body.
@@ -395,6 +395,8 @@ export interface ResponsesResult {
   // never synthesizes it.
   incomplete_details: { reason: string } | null;
   error: { message: string; code: string; type?: string } | null;
+  // https://developers.openai.com/api/reference/resources/responses/methods/create
+  service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null;
   usage?: {
     input_tokens: number;
     output_tokens: number;

From d23f6a5ef2dec43ade318d06c327d498cd0d1128 Mon Sep 17 00:00:00 2001
From: Menci <mencici@msn.com>
Date: Sat, 20 Jun 2026 04:04:29 +0800
Subject: [PATCH 3/6] feat(codex): price flex/priority service tiers per OpenAI
 public rates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `tiers.flex` and `tiers.priority` overlays for every priced Codex slug
so the dashboard's notional cost reflects which OpenAI service tier the
request actually ran on. The gateway already captures the response-level
`service_tier` onto `TokenUsage.tier`; this commit completes the loop by
giving the cost compute a per-tier rate row to look up.

Tier overrides match OpenAI's public pricing (verified 2026-06-19 against
https://platform.openai.com/docs/pricing):

  gpt-5.5         flex $2.5/$0.25/$15      priority $12.5/$1.25/$75
  gpt-5.4         flex $1.25/$0.13/$7.5    priority $5/$0.5/$30
  gpt-5.4-mini    flex $0.375/$0.0375/$2.25  priority $1.5/$0.15/$9

`codex-auto-review` shares `gpt-5.4`'s pricing including the tier
overrides. Codex CLI's `/fast` toggle writes `service_tier: "priority"` on
the wire (per openai/codex's `ServiceTier::Fast.request_value()`), so
operator-facing rows tagged "fast" cost out at the priority row.

Cache-write rate stays unset on these entries — OpenAI charges cache
creation at the same rate as input, which `unitPriceForDimension`'s
fallback chain already covers.
---
 packages/provider-codex/src/models_test.ts | 37 ++++++++++++++-
 packages/provider-codex/src/pricing.ts     | 54 ++++++++++++++++++----
 2 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/packages/provider-codex/src/models_test.ts b/packages/provider-codex/src/models_test.ts
index 116daa8e6..edf04b75d 100644
--- a/packages/provider-codex/src/models_test.ts
+++ b/packages/provider-codex/src/models_test.ts
@@ -2,6 +2,7 @@ import { afterEach, describe, expect, test, vi } from 'vitest';
 
 import { CODEX_CLI_VERSION } from './constants.ts';
 import { codexRawToUpstreamModel, fetchCodexCatalog } from './models.ts';
+import { resolveEffectivePricing } from '@floway-dev/protocols/common';
 import { directFetcher } from '@floway-dev/provider';
 
 const okJson = (body: unknown): Response => new Response(JSON.stringify(body), { status: 200, headers: { 'content-type': 'application/json' } });
@@ -76,11 +77,45 @@ describe('codexRawToUpstreamModel', () => {
 
   test('attaches OpenAI-API-rate cost for known slugs and treats codex-auto-review as gpt-5.4', () => {
     const flagship = codexRawToUpstreamModel({ id: 'gpt-5.4', display_name: 'GPT-5.4', context_window: 272000 }, noFlags);
-    expect(flagship.cost).toEqual({ input: 2.5, input_cache_read: 0.25, output: 15 });
+    expect(flagship.cost).toEqual({
+      input: 2.5,
+      input_cache_read: 0.25,
+      output: 15,
+      tiers: {
+        flex: { input: 1.25, input_cache_read: 0.13, output: 7.5 },
+        priority: { input: 5, input_cache_read: 0.5, output: 30 },
+      },
+    });
     const review = codexRawToUpstreamModel({ id: 'codex-auto-review', display_name: 'Codex Auto Review', context_window: 272000 }, noFlags);
     expect(review.cost).toEqual(flagship.cost);
   });
 
+  // End-to-end resolution check: tier keys must match the wire-value strings
+  // billableServiceTier persists, not the enum *names* in Codex's Rust source.
+  // A casing typo (e.g. `Flex`) or a divergence from the wire value (e.g.
+  // `fast`) type-checks cleanly (tier keys are free-form strings) and would
+  // bill every tiered request at base.
+  test('cost.tiers keys resolve through resolveEffectivePricing for the wire-value strings', () => {
+    const flagship = codexRawToUpstreamModel({ id: 'gpt-5.4', display_name: 'GPT-5.4', context_window: 272000 }, noFlags);
+    if (!flagship.cost) throw new Error('expected cost to be defined');
+
+    expect(resolveEffectivePricing(flagship.cost, 'priority')).toEqual({
+      input: 5,
+      input_cache_read: 0.5,
+      output: 30,
+    });
+    expect(resolveEffectivePricing(flagship.cost, 'flex')).toEqual({
+      input: 1.25,
+      input_cache_read: 0.13,
+      output: 7.5,
+    });
+    expect(resolveEffectivePricing(flagship.cost, null)).toEqual({
+      input: 2.5,
+      input_cache_read: 0.25,
+      output: 15,
+    });
+  });
+
   test('omits cost for unknown slugs (forward-compat with new upstream models)', () => {
     const m = codexRawToUpstreamModel({ id: 'gpt-future-unreleased', display_name: 'X', context_window: 1 }, noFlags);
     expect(m.cost).toBeUndefined();
diff --git a/packages/provider-codex/src/pricing.ts b/packages/provider-codex/src/pricing.ts
index 07df17d8d..4895e157f 100644
--- a/packages/provider-codex/src/pricing.ts
+++ b/packages/provider-codex/src/pricing.ts
@@ -8,25 +8,63 @@
 // https://github.com/anomalyco/models.dev/blob/8e6d393c01cb42d41a92f18725eef545e7190efb/packages/core/src/schema.ts
 //
 // Source of truth for OpenAI public API prices the table is derived from:
-// https://openai.com/api/pricing/
+// https://developers.openai.com/api/docs/pricing
+// Refresh procedure: .agents/skills/fetching-models-pricing/.
+//
+// Per-tier overrides cover the two OpenAI service-tier wire values reachable
+// through the Codex CLI's `ServiceTier` enum (`priority` / `flex`):
+//   - `flex` — discounted, latency-tolerant; the CLI sets `service_tier: "flex"`.
+//     https://developers.openai.com/api/docs/guides/flex-processing
+//   - `priority` — premium-priced, lower-latency lane; the CLI's `/fast` toggle
+//     stamps `service_tier: "priority"`.
+//     https://developers.openai.com/api/docs/guides/priority-processing
+// https://github.com/openai/codex/blob/f774455c3a831dfab2c6f37a1f624b8097f6f2c2/codex-rs/protocol/src/config_types.rs#L445
+// Whether a request actually goes through at the requested tier depends on
+// what each model's catalog entry (`service_tiers` block in upstream
+// `models.json`) accepts and on remaining capacity; OpenAI reports the
+// actually-served tier in the response-level `service_tier` field and the
+// gateway maps it onto `TokenUsage.tier` so cost compute picks the right row.
 //
 // Coverage: every slug surfaced by /codex/models for ChatGPT Plus today
 // (gpt-5.5, gpt-5.4, gpt-5.4-mini, codex-auto-review). New slugs the upstream
 // rolls out at higher plans (Pro / Team / Enterprise) should be added here so
 // the dashboard reports their cost too.
-//
-// Refresh procedure: .agents/skills/fetching-models-pricing/.
 
 import type { ModelPricing } from '@floway-dev/protocols/common';
 
-const GPT_5_4_PRICING: ModelPricing = { input: 2.5, input_cache_read: 0.25, output: 15 };
+const GPT_5_4_PRICING: ModelPricing = {
+  input: 2.5,
+  input_cache_read: 0.25,
+  output: 15,
+  tiers: {
+    flex: { input: 1.25, input_cache_read: 0.13, output: 7.5 },
+    priority: { input: 5, input_cache_read: 0.5, output: 30 },
+  },
+};
 
 const CODEX_MODEL_PRICING: readonly (readonly [key: string | RegExp, pricing: ModelPricing])[] = [
-  ['gpt-5.5', { input: 5, input_cache_read: 0.5, output: 30 }],
+  ['gpt-5.5', {
+    input: 5,
+    input_cache_read: 0.5,
+    output: 30,
+    tiers: {
+      flex: { input: 2.5, input_cache_read: 0.25, output: 15 },
+      priority: { input: 12.5, input_cache_read: 1.25, output: 75 },
+    },
+  }],
   ['gpt-5.4', GPT_5_4_PRICING],
-  ['gpt-5.4-mini', { input: 0.75, input_cache_read: 0.075, output: 4.5 }],
-  // Internal review model gated under codex_cli_rs's auto-review feature; runs
-  // on the same compute as gpt-5.4 and is billed identically.
+  ['gpt-5.4-mini', {
+    input: 0.75,
+    input_cache_read: 0.075,
+    output: 4.5,
+    tiers: {
+      flex: { input: 0.375, input_cache_read: 0.0375, output: 2.25 },
+      priority: { input: 1.5, input_cache_read: 0.15, output: 9 },
+    },
+  }],
+  // Internal review model gated under codex_cli_rs's auto-review feature. No
+  // public price surface; billed as a notional clone of gpt-5.4 (closest
+  // analogue we have).
   ['codex-auto-review', GPT_5_4_PRICING],
 ];
 

From 1914062ab1338e8b81badd156754d93657767a6d Mon Sep 17 00:00:00 2001
From: Menci <mencici@msn.com>
Date: Sat, 20 Jun 2026 11:52:35 +0800
Subject: [PATCH 4/6] feat(gateway): migration 0036 adds tier column to usage +
 usage_requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR 69's runtime already keys usage buckets on TokenUsage.tier via
`COALESCE(tier, '')` and the repo writes `tier` into `usage` and
`usage_requests`, but the schema column didn't exist yet. Add it via a
dedicated migration so PR 68 can stay focused on the per-TTL cache
dimension and the tier column lives in this PR's blast radius.

The CHECK list on `usage.dimension` is widened here to admit
`input_cache_write_1h` as well so 0035 and 0036 converge on the same end
state regardless of merge order — the codebase on this branch doesn't
write that dimension yet (PR 68 ships the parser), but the schema is
forward-compatible.

Also extend `pricingField` (and the control-plane zod schema) to admit
`cost.tiers`, so an operator-supplied per-tier overlay round-trips
through the upstream save path instead of being silently dropped at
parse time. Without this the per-tier editor in the next commit cannot
persist anything.
---
 .../migrations/0036_usage_tier_column.sql     | 56 +++++++++++++++++++
 .../src/control-plane/data-transfer/routes.ts | 10 +++-
 .../data-transfer/routes_test.ts              |  2 +-
 packages/gateway/src/control-plane/schemas.ts | 28 +++++++---
 packages/provider/src/model-config.ts         | 18 +++++-
 packages/provider/src/model-config_test.ts    | 56 +++++++++++++++++++
 6 files changed, 158 insertions(+), 12 deletions(-)
 create mode 100644 packages/gateway/migrations/0036_usage_tier_column.sql
 create mode 100644 packages/provider/src/model-config_test.ts

diff --git a/packages/gateway/migrations/0036_usage_tier_column.sql b/packages/gateway/migrations/0036_usage_tier_column.sql
new file mode 100644
index 000000000..e75125079
--- /dev/null
+++ b/packages/gateway/migrations/0036_usage_tier_column.sql
@@ -0,0 +1,56 @@
+-- Add the per-request service tier column to `usage` + `usage_requests`.
+--
+-- `tier` is the upstream-stamped service-tier marker (Anthropic `usage.speed`,
+-- OpenAI top-level `service_tier`). It is part of bucket identity, so a model
+-- billed at multiple tiers in one hour aggregates as separate buckets with
+-- distinct unit prices; recording writes NULL for base-tier requests and a
+-- non-empty string otherwise. The unique index uses `COALESCE(tier, '')`
+-- because SQLite treats NULLs as distinct under UNIQUE.
+--
+-- SQLite cannot add a column to the middle of a UNIQUE INDEX in place, so
+-- both tables are recreated. Existing rows backfill `tier = NULL`, which the
+-- aggregator treats as base pricing — historical buckets compute identically.
+
+CREATE TABLE usage_new (
+  key_id TEXT NOT NULL,
+  model TEXT NOT NULL,
+  upstream TEXT,
+  model_key TEXT NOT NULL,
+  hour TEXT NOT NULL,
+  tier TEXT,
+  dimension TEXT NOT NULL CHECK (dimension IN (
+    'input', 'input_cache_read', 'input_cache_write', 'input_cache_write_1h', 'input_image', 'output', 'output_image'
+  )),
+  tokens INTEGER NOT NULL DEFAULT 0,
+  unit_price REAL
+);
+
+INSERT INTO usage_new (key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price)
+  SELECT key_id, model, upstream, model_key, hour, NULL, dimension, tokens, unit_price FROM usage;
+
+DROP TABLE usage;
+ALTER TABLE usage_new RENAME TO usage;
+
+CREATE UNIQUE INDEX idx_usage_dimension_identity
+  ON usage (key_id, model, COALESCE(upstream, ''), model_key, hour, COALESCE(tier, ''), dimension);
+CREATE INDEX idx_usage_dimension_hour ON usage (hour);
+
+CREATE TABLE usage_requests_new (
+  key_id TEXT NOT NULL,
+  model TEXT NOT NULL,
+  upstream TEXT,
+  model_key TEXT NOT NULL,
+  hour TEXT NOT NULL,
+  tier TEXT,
+  requests INTEGER NOT NULL DEFAULT 0
+);
+
+INSERT INTO usage_requests_new (key_id, model, upstream, model_key, hour, tier, requests)
+  SELECT key_id, model, upstream, model_key, hour, NULL, requests FROM usage_requests;
+
+DROP TABLE usage_requests;
+ALTER TABLE usage_requests_new RENAME TO usage_requests;
+
+CREATE UNIQUE INDEX idx_usage_requests_identity
+  ON usage_requests (key_id, model, COALESCE(upstream, ''), model_key, hour, COALESCE(tier, ''));
+CREATE INDEX idx_usage_requests_hour ON usage_requests (hour);
diff --git a/packages/gateway/src/control-plane/data-transfer/routes.ts b/packages/gateway/src/control-plane/data-transfer/routes.ts
index 780a0318d..52cf72d71 100644
--- a/packages/gateway/src/control-plane/data-transfer/routes.ts
+++ b/packages/gateway/src/control-plane/data-transfer/routes.ts
@@ -403,10 +403,14 @@ const parseUsageRecords = (value: unknown): { type: 'ok'; records: UsageRecord[]
       return { type: 'invalid', index: i, error: 'upstream must use a raw upstream id, not a legacy provider-prefixed identity' };
     }
     if (record.tier !== undefined && record.tier !== null && typeof record.tier !== 'string') {
-      return { type: 'invalid', index: i, error: 'record has invalid tier (must be a string or null)' };
+      return { type: 'invalid', index: i, error: 'tier, when present, must be a string or null' };
     }
-    // `tier` is absent on exports taken before the column existed; collapse
-    // the absent and explicit-null cases into the same wire value.
+    if (record.tier === '') {
+      return { type: 'invalid', index: i, error: 'tier must be a non-empty string or null/absent' };
+    }
+    // Empty-string is rejected rather than normalized to null: the unique
+    // index folds NULL/'' under COALESCE, so a '' import would silently
+    // merge with base-tier rows.
     const tier: string | null = typeof record.tier === 'string' ? record.tier : null;
     const tokensResult = parseImportedTokens(record.tokens);
     if (tokensResult.type === 'invalid') return { type: 'invalid', index: i, error: 'record has invalid token dimension counts' };
diff --git a/packages/gateway/src/control-plane/data-transfer/routes_test.ts b/packages/gateway/src/control-plane/data-transfer/routes_test.ts
index d43d7a975..de3c08ac0 100644
--- a/packages/gateway/src/control-plane/data-transfer/routes_test.ts
+++ b/packages/gateway/src/control-plane/data-transfer/routes_test.ts
@@ -177,7 +177,7 @@ const USAGE_1: UsageRecord = {
   upstream: 'up_copilot_a',
   modelKey: 'claude-opus-4.7',
   hour: '2026-01-01T10',
-  tier: null,
+  tier: 'fast',
   requests: 5,
   tokens: { input: 1000, output: 500, input_cache_read: 120, input_cache_write: 80 },
   cost: null,
diff --git a/packages/gateway/src/control-plane/schemas.ts b/packages/gateway/src/control-plane/schemas.ts
index 6857e3eab..6de95e9ec 100644
--- a/packages/gateway/src/control-plane/schemas.ts
+++ b/packages/gateway/src/control-plane/schemas.ts
@@ -60,6 +60,18 @@ const modelEndpointsSchema = z.object({
   imagesEdits: z.object({}).optional(),
 });
 
+// Shared between base pricing and per-tier overlays so the two always carry
+// the same dimension set.
+const pricingDimensionShape = {
+  input: z.number().nonnegative().optional(),
+  output: z.number().nonnegative().optional(),
+  input_cache_read: z.number().nonnegative().optional(),
+  input_cache_write: z.number().nonnegative().optional(),
+  input_cache_write_1h: z.number().nonnegative().optional(),
+  input_image: z.number().nonnegative().optional(),
+  output_image: z.number().nonnegative().optional(),
+};
+
 // Mirrors the runtime UpstreamModelConfig in @floway-dev/provider.
 // Azure and custom upstreams share this per-model entry; the canonical
 // per-model endpoint validation lives in the runtime validator.
@@ -70,13 +82,15 @@ const upstreamModelSchema = z.object({
   endpoints: modelEndpointsSchema,
   display_name: z.string().optional(),
   cost: z.object({
-    input: z.number().optional(),
-    output: z.number().optional(),
-    input_cache_read: z.number().optional(),
-    input_cache_write: z.number().optional(),
-    input_cache_write_1h: z.number().optional(),
-    input_image: z.number().optional(),
-    output_image: z.number().optional(),
+    ...pricingDimensionShape,
+    // See ModelPricing.tiers in @floway-dev/protocols/common for semantics.
+    tiers: z.record(
+      z.string().min(1),
+      z.object(pricingDimensionShape).refine(
+        t => Object.values(t).some(v => v !== undefined),
+        { message: 'tier overlay must declare at least one rate' },
+      ),
+    ).optional(),
   }).optional(),
   flagOverrides: z.object({
     enabled: z.boolean(),
diff --git a/packages/provider/src/model-config.ts b/packages/provider/src/model-config.ts
index da1692d40..ecd8e1e0c 100644
--- a/packages/provider/src/model-config.ts
+++ b/packages/provider/src/model-config.ts
@@ -1,5 +1,5 @@
 import { isKnownFlagId } from './flags.ts';
-import { BILLING_DIMENSIONS, type ModelEndpointKey, type ModelEndpoints, type ModelKind, type ModelPricing } from '@floway-dev/protocols/common';
+import { BILLING_DIMENSIONS, type BillingDimension, type ModelEndpointKey, type ModelEndpoints, type ModelKind, type ModelPricing } from '@floway-dev/protocols/common';
 import { kindForEndpoints } from '@floway-dev/protocols/common';
 
 export interface UpstreamModelLimits {
@@ -127,6 +127,22 @@ export const pricingField = (value: unknown, label: string): ModelPricing | unde
   for (const dimension of BILLING_DIMENSIONS) {
     if (record[dimension] !== undefined) pricing[dimension] = nonNegativeNumberField(record[dimension], `${label}.${dimension}`);
   }
+  if (record.tiers !== undefined) {
+    if (!isRecord(record.tiers)) throw new Error(`Malformed ${label}.tiers: must be an object`);
+    const tiers: Record<string, Partial<Record<BillingDimension, number>>> = {};
+    for (const [tierName, overlay] of Object.entries(record.tiers)) {
+      if (tierName === '') throw new Error(`Malformed ${label}.tiers: tier name must be non-empty`);
+      if (!isRecord(overlay)) throw new Error(`Malformed ${label}.tiers.${tierName}: must be an object`);
+      const tierPricing: Partial<Record<BillingDimension, number>> = {};
+      for (const dimension of BILLING_DIMENSIONS) {
+        if (overlay[dimension] !== undefined) {
+          tierPricing[dimension] = nonNegativeNumberField(overlay[dimension], `${label}.tiers.${tierName}.${dimension}`);
+        }
+      }
+      if (Object.keys(tierPricing).length > 0) tiers[tierName] = tierPricing;
+    }
+    if (Object.keys(tiers).length > 0) pricing.tiers = tiers;
+  }
   return Object.keys(pricing).length > 0 ? pricing : undefined;
 };
 
diff --git a/packages/provider/src/model-config_test.ts b/packages/provider/src/model-config_test.ts
new file mode 100644
index 000000000..8c25e695f
--- /dev/null
+++ b/packages/provider/src/model-config_test.ts
@@ -0,0 +1,56 @@
+import { test } from 'vitest';
+
+import { pricingField } from './model-config.ts';
+import { assertEquals, assertThrows } from '@floway-dev/test-utils';
+
+test('pricingField parses bare dimensions and drops empty objects', () => {
+  assertEquals(pricingField(undefined, 'cost'), undefined);
+  assertEquals(pricingField({}, 'cost'), undefined);
+  assertEquals(
+    pricingField({ input: 5, output: 25, input_cache_read: 0.5 }, 'cost'),
+    { input: 5, output: 25, input_cache_read: 0.5 },
+  );
+});
+
+test('pricingField parses per-tier overlays alongside base rates', () => {
+  const result = pricingField(
+    {
+      input: 5,
+      output: 25,
+      tiers: {
+        fast: { input: 30, output: 150 },
+        flex: { input: 2.5 },
+      },
+    },
+    'cost',
+  );
+  assertEquals(result, {
+    input: 5,
+    output: 25,
+    tiers: {
+      fast: { input: 30, output: 150 },
+      flex: { input: 2.5 },
+    },
+  });
+});
+
+test('pricingField drops empty tier overlays and skips unknown keys inside them', () => {
+  const result = pricingField(
+    {
+      input: 5,
+      tiers: {
+        fast: { input: 30, bogus_key: 99 },
+        priority: {},
+      },
+    },
+    'cost',
+  );
+  assertEquals(result, { input: 5, tiers: { fast: { input: 30 } } });
+});
+
+test('pricingField rejects non-object tiers, empty names, and negative rates', () => {
+  assertThrows(() => pricingField({ tiers: 'nope' }, 'cost'), Error, 'tiers');
+  assertThrows(() => pricingField({ tiers: { '': { input: 5 } } }, 'cost'), Error, 'tier name');
+  assertThrows(() => pricingField({ tiers: { fast: 1 } }, 'cost'), Error, 'tiers.fast');
+  assertThrows(() => pricingField({ tiers: { fast: { input: -1 } } }, 'cost'), Error, 'non-negative');
+});

From 3c6066eec2052a116bc1aa11d3950a819240fb6f Mon Sep 17 00:00:00 2001
From: Menci <mencici@msn.com>
Date: Sat, 20 Jun 2026 11:52:47 +0800
Subject: [PATCH 5/6] feat(web): per-tier pricing override editor on
 ModelEditor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Operators editing custom upstreams can now declare per-tier pricing
overlays directly from the dashboard. Each tier row carries a free-form
name (matching the wire value the upstream stamps onto the response's
`service_tier` / `usage.speed`) and a sparse set of dimension
rates that shadow the base pricing — absent rates fall through to the
base row, mirroring `resolveEffectivePricing` semantics.

Drafts are tracked in local component state rather than recomputed from
stored cost on every keystroke, so a tier whose name is still empty
stays on screen instead of being collapsed away by the save path's
non-empty-name filter. Duplicate tier names within one model render an
inline warning; the save path keeps the last entry per name.

Also surface `BillingDimension` as a top-level type on the SPA API
module and route the model editor's pricing dim list through it, so the
`tiers` field on `ModelPricing` no longer leaks into the dimension
labels record.
---
 apps/web/src/api/types.ts                     |   8 +-
 .../components/upstream-edit/ModelEditor.vue  | 151 +++++++++++++++++-
 2 files changed, 155 insertions(+), 4 deletions(-)

diff --git a/apps/web/src/api/types.ts b/apps/web/src/api/types.ts
index 069233d2f..2a5313067 100644
--- a/apps/web/src/api/types.ts
+++ b/apps/web/src/api/types.ts
@@ -23,7 +23,13 @@ export type ModelEndpointKey = keyof ModelEndpoints;
 
 // USD per million tokens, keyed by billing dimension.
 export type BillingDimension = 'input' | 'input_cache_read' | 'input_cache_write' | 'input_cache_write_1h' | 'input_image' | 'output' | 'output_image';
-export type ModelPricing = Partial<Record<BillingDimension, number>>;
+
+// Base rates plus per-tier overlays for OpenAI service tiers (`flex`,
+// `priority`, ...) and Anthropic fast mode. Each overlay is a partial that
+// shadows individual dimensions; absent keys fall through to the base rate.
+export interface ModelPricing extends Partial<Record<BillingDimension, number>> {
+  tiers?: Record<string, Partial<Record<BillingDimension, number>>>;
+}
 
 export interface UpstreamModelConfig {
   upstreamModelId: string;
diff --git a/apps/web/src/components/upstream-edit/ModelEditor.vue b/apps/web/src/components/upstream-edit/ModelEditor.vue
index aa70330cd..fb44ae10f 100644
--- a/apps/web/src/components/upstream-edit/ModelEditor.vue
+++ b/apps/web/src/components/upstream-edit/ModelEditor.vue
@@ -1,5 +1,5 @@
 <script setup lang="ts">
-import { computed } from 'vue';
+import { computed, ref, watch } from 'vue';
 
 import EndpointsField from './EndpointsField.vue';
 import FlagOverridesEditor from './FlagOverridesEditor.vue';
@@ -34,7 +34,7 @@ const kindOptions: { value: ModelKind; label: string }[] = [
   { value: 'image', label: 'Image' },
 ];
 
-const PRICING_LABELS: Record<string, string> = {
+const PRICING_LABELS: Record<BillingDimension, string> = {
   input: 'Input ($/MTok)',
   input_cache_read: 'Cache Read ($/MTok)',
   input_cache_write: 'Cache Write 5m ($/MTok)',
@@ -82,7 +82,7 @@ const updateLimit = (
   patch({ limits: Object.keys(limits).length > 0 ? limits : undefined });
 };
 
-const updateCost = (key: keyof ModelPricing, raw: string | number | null | undefined) => {
+const updateCost = (key: BillingDimension, raw: string | number | null | undefined) => {
   if (!config.value) return;
   const cost = { ...(config.value.cost ?? {}) } as Record<string, unknown>;
   const num = parseOptionalNumber(raw);
@@ -94,6 +94,87 @@ const updateCost = (key: keyof ModelPricing, raw: string | number | null | undef
   patch({ cost: hasAny ? (cost as ModelPricing) : undefined });
 };
 
+// Per-tier overlays. A tier overlay is a sparse pricing snapshot keyed by
+// dimension; declared fields shadow the base rate, absent fields fall
+// through. We hold drafts in local state (rather than recomputing from the
+// stored cost on every keystroke) so an in-progress tier whose name is still
+// empty stays on screen — `writeTierDrafts` skips empty-name entries, so a
+// purely-derived list would lose newly-added rows.
+interface TierDraft { name: string; rates: Partial<Record<BillingDimension, number>> }
+
+const tierDraftsFor = (cost: ModelPricing | undefined): TierDraft[] => {
+  const tiers = cost?.tiers;
+  if (!tiers) return [];
+  return Object.entries(tiers).map(([name, rates]) => ({ name, rates: { ...rates } }));
+};
+
+const tierDrafts = ref<TierDraft[]>(tierDraftsFor(config.value?.cost));
+
+// Resync the local drafts whenever the active row changes (a different model's
+// cost replaces the working set). Edits within the same row leave the drafts
+// alone — `writeTierDrafts` writes both local state and stored cost in lockstep.
+watch(() => props.row?.uiId, () => {
+  tierDrafts.value = tierDraftsFor(config.value?.cost);
+});
+
+const writeTierDrafts = (drafts: readonly TierDraft[]) => {
+  if (!config.value) return;
+  tierDrafts.value = drafts.map(d => ({ name: d.name, rates: { ...d.rates } }));
+  const base = { ...(config.value.cost ?? {}) } as ModelPricing;
+  delete base.tiers;
+  const tiers: Record<string, Partial<Record<BillingDimension, number>>> = {};
+  for (const draft of drafts) {
+    const trimmed = draft.name.trim();
+    if (!trimmed) continue;
+    // Last write wins on duplicate names — the validation message in the
+    // template tells the operator to rename collisions.
+    const rates: Partial<Record<BillingDimension, number>> = {};
+    for (const [k, v] of Object.entries(draft.rates)) {
+      if (typeof v === 'number' && Number.isFinite(v)) rates[k as BillingDimension] = v;
+    }
+    if (Object.keys(rates).length > 0) tiers[trimmed] = rates;
+  }
+  const next: ModelPricing = { ...base };
+  if (Object.keys(tiers).length > 0) next.tiers = tiers;
+  const hasAny = Object.entries(next).some(([k, v]) => k === 'tiers' ? Object.keys(v as object).length > 0 : v !== undefined);
+  patch({ cost: hasAny ? next : undefined });
+};
+
+const duplicateTierNames = computed<Set<string>>(() => {
+  const seen = new Map<string, number>();
+  for (const draft of tierDrafts.value) {
+    const name = draft.name.trim();
+    if (!name) continue;
+    seen.set(name, (seen.get(name) ?? 0) + 1);
+  }
+  return new Set([...seen.entries()].filter(([, count]) => count > 1).map(([name]) => name));
+});
+
+const updateTierName = (index: number, name: string) => {
+  const next = tierDrafts.value.map((draft, i) => i === index ? { ...draft, name } : draft);
+  writeTierDrafts(next);
+};
+
+const updateTierRate = (index: number, dim: BillingDimension, raw: string | number | null | undefined) => {
+  const num = parseOptionalNumber(raw);
+  const next = tierDrafts.value.map((draft, i) => {
+    if (i !== index) return draft;
+    const rates = { ...draft.rates };
+    if (num === undefined) delete rates[dim];
+    else rates[dim] = num;
+    return { ...draft, rates };
+  });
+  writeTierDrafts(next);
+};
+
+const addTier = () => {
+  writeTierDrafts([...tierDrafts.value, { name: '', rates: {} }]);
+};
+
+const removeTier = (index: number) => {
+  writeTierDrafts(tierDrafts.value.filter((_, i) => i !== index));
+};
+
 const toggleFlagOverridesEnabled = () => {
   if (!editable.value || !config.value) return;
   if (config.value.flagOverrides?.enabled) {
@@ -273,6 +354,70 @@ const updateFlagOverrides = (values: Record<string, boolean>) => {
           </div>
         </section>
 
+        <section>
+          <div class="mb-3 flex items-baseline gap-3">
+            <h3 class="text-[11px] font-semibold uppercase tracking-wider text-gray-500">Per-Tier Pricing Overrides</h3>
+            <span class="text-[11px] text-gray-500">
+              tier names match the wire value the upstream stamps onto usage
+              (<code class="font-mono">fast</code>, <code class="font-mono">flex</code>, <code class="font-mono">priority</code>, ...) — blank rates fall through to base
+            </span>
+            <Button
+              v-if="editable"
+              variant="secondary"
+              size="sm"
+              class="ml-auto"
+              @click="addTier"
+            >+ Add Tier</Button>
+          </div>
+          <div v-if="tierDrafts.length === 0" class="text-[11px] text-gray-600">
+            <template v-if="editable">No tiers defined. Add one to override pricing for requests stamped with a service tier.</template>
+            <template v-else>No tier overrides on this model.</template>
+          </div>
+          <div v-else class="space-y-3">
+            <div
+              v-for="(draft, index) in tierDrafts"
+              :key="index"
+              class="rounded border border-white/[0.06] bg-white/[0.02] p-3"
+            >
+              <div class="mb-3 flex items-end gap-3">
+                <label class="block flex-1 space-y-1.5">
+                  <span class="block text-xs font-medium text-gray-500">Tier Name</span>
+                  <Input
+                    :model-value="draft.name"
+                    :readonly="!editable"
+                    :invalid="duplicateTierNames.has(draft.name.trim())"
+                    placeholder="e.g. fast"
+                    class="font-mono"
+                    @update:model-value="v => updateTierName(index, v)"
+                  />
+                </label>
+                <Button
+                  v-if="editable"
+                  variant="danger"
+                  size="sm"
+                  @click="removeTier(index)"
+                >Remove</Button>
+              </div>
+              <p v-if="duplicateTierNames.has(draft.name.trim())" class="mb-2 text-[11px] text-accent-rose">
+                Duplicate tier name — only the last entry with this name is saved.
+              </p>
+              <div class="grid gap-3 sm:grid-cols-2 xl:grid-cols-4">
+                <label v-for="dim in PRICING_BY_KIND[rowKind]" :key="dim" class="block space-y-1.5">
+                  <span class="block text-xs font-medium text-gray-500">{{ PRICING_LABELS[dim] }}</span>
+                  <Input
+                    type="number"
+                    :model-value="draft.rates[dim]"
+                    :readonly="!editable"
+                    placeholder="inherit"
+                    class="font-mono"
+                    @update:model-value="v => updateTierRate(index, dim, v)"
+                  />
+                </label>
+              </div>
+            </div>
+          </div>
+        </section>
+
         <section>
           <div class="mb-3 flex items-baseline gap-3">
             <h3 class="text-[11px] font-semibold uppercase tracking-wider text-gray-500">Override Feature Flags</h3>

From 2b07bbe62ae4de7cbb1ea82d8b238043eadbf1c0 Mon Sep 17 00:00:00 2001
From: Menci <mencici@msn.com>
Date: Sun, 21 Jun 2026 02:01:11 +0800
Subject: [PATCH 6/6] feat(web): collapse Per-Tier Pricing Overrides by
 default; drop nested per-tier card
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per-tier overrides are a niche editing surface — most operators stay on
the base pricing for the model's lifetime. On a model with no overrides
the section now starts collapsed to a single header line; on a model that
already has overrides it starts expanded so they are visible without an
extra click. The header carries a chevron and a tier-count badge, so the
number of overrides that will be saved stays visible whenever the section
is collapsed. Clicking + Add Tier auto-expands.

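Inside the section, the previous per-tier card chrome (rounded border +
bg-white tint + extra padding) is gone — each tier reads as two flush
rows: tier-name input + move-up / move-down / remove icon buttons on
row 1, the same Pricing grid on row 2. Visually consistent with the base
Pricing section above it; no nested container framing.

Also in this change:
- tier drafts carry a stable id that is used as the v-for key instead of
  the row index
- rows that will not be saved (rates without a name, or a name without
  rates) get an inline warning
- negative rates are dropped at the form boundary and the number inputs
  set min="0"
- the dashboard re-exports BillingDimension / ModelPricing from the
  gateway package instead of redeclaring them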
---
 apps/web/src/api/types.ts                     |  14 +-
 .../components/upstream-edit/ModelEditor.vue  | 240 +++++++++++++-----
 packages/gateway/package.json                 |   1 +
 .../src/control-plane/pricing/types.ts        |   1 +
 packages/protocols/src/common/models.ts       |   6 +-
 5 files changed, 189 insertions(+), 73 deletions(-)
 create mode 100644 packages/gateway/src/control-plane/pricing/types.ts

diff --git a/apps/web/src/api/types.ts b/apps/web/src/api/types.ts
index 2a5313067..1116f7dcb 100644
--- a/apps/web/src/api/types.ts
+++ b/apps/web/src/api/types.ts
@@ -21,15 +21,11 @@ export interface ModelEndpoints {
 
 export type ModelEndpointKey = keyof ModelEndpoints;
 
-// USD per million tokens, keyed by billing dimension.
-export type BillingDimension = 'input' | 'input_cache_read' | 'input_cache_write' | 'input_cache_write_1h' | 'input_image' | 'output' | 'output_image';
-
-// Base rates plus per-tier overlays for OpenAI service tiers (`flex`,
-// `priority`, ...) and Anthropic fast mode. Each overlay is a partial that
-// shadows individual dimensions; absent keys fall through to the base rate.
-export interface ModelPricing extends Partial<Record<BillingDimension, number>> {
-  tiers?: Record<string, Partial<Record<BillingDimension, number>>>;
-}
+// USD per million tokens, keyed by billing dimension. Imported from the
+// gateway so the dashboard's pricing form stays locked to the same definition
+// the backend writes against — same pattern as `ProxyRecord` below.
+import type { BillingDimension, ModelPricing } from '@floway-dev/gateway/control-plane/pricing/types';
+export type { BillingDimension, ModelPricing };
 
 export interface UpstreamModelConfig {
   upstreamModelId: string;
diff --git a/apps/web/src/components/upstream-edit/ModelEditor.vue b/apps/web/src/components/upstream-edit/ModelEditor.vue
index fb44ae10f..856ccdea5 100644
--- a/apps/web/src/components/upstream-edit/ModelEditor.vue
+++ b/apps/web/src/components/upstream-edit/ModelEditor.vue
@@ -5,7 +5,7 @@ import EndpointsField from './EndpointsField.vue';
 import FlagOverridesEditor from './FlagOverridesEditor.vue';
 import { configOf, defaultEndpointsForKind, publicIdOf, titleFor, type Row } from './modelRows.ts';
 import type { BillingDimension, FlagDef, ModelKind, ModelPricing, UpstreamModelConfig, UpstreamProviderKind } from '../../api/types.ts';
-import { Button, Input, Select, Switch } from '@floway-dev/ui';
+import { Button, Input, Select, Switch, Tooltip } from '@floway-dev/ui';
 
 const props = defineProps<{
   row: Row | null;
@@ -37,8 +37,8 @@ const kindOptions: { value: ModelKind; label: string }[] = [
 const PRICING_LABELS: Record<BillingDimension, string> = {
   input: 'Input ($/MTok)',
   input_cache_read: 'Cache Read ($/MTok)',
-  input_cache_write: 'Cache Write 5m ($/MTok)',
-  input_cache_write_1h: 'Cache Write 1h ($/MTok)',
+  input_cache_write: 'Cache Write ($/MTok)',
+  input_cache_write_1h: 'Cache Write (1h) ($/MTok)',
   input_image: 'Image Input ($/MTok)',
   output: 'Output ($/MTok)',
   output_image: 'Image Output ($/MTok)',
@@ -67,7 +67,10 @@ const setKind = (k: ModelKind) => {
 const parseOptionalNumber = (raw: string | number | null | undefined): number | undefined => {
   if (raw === '' || raw === null || raw === undefined) return undefined;
   const num = Number(raw);
-  return Number.isFinite(num) ? num : undefined;
+  // Backend pricing validators reject negatives (see `nonNegativeNumberField`
+  // in packages/provider/src/model-config.ts); drop them at the form boundary
+  // so a typo doesn't stage data the next PUT will 400 on.
+  return Number.isFinite(num) && num >= 0 ? num : undefined;
 };
 
 const updateLimit = (
@@ -84,14 +87,19 @@ const updateLimit = (
 
 const updateCost = (key: BillingDimension, raw: string | number | null | undefined) => {
   if (!config.value) return;
-  const cost = { ...(config.value.cost ?? {}) } as Record<string, unknown>;
+  const cost = { ...(config.value.cost ?? {}) } as ModelPricing;
   const num = parseOptionalNumber(raw);
   if (num === undefined) delete cost[key];
   else cost[key] = num;
-  // Every dimension is independently optional. When all are empty we drop the
-  // whole object so the row stores `cost: undefined` rather than an empty stub.
-  const hasAny = Object.values(cost).some(v => v !== undefined);
-  patch({ cost: hasAny ? (cost as ModelPricing) : undefined });
+  // Every dimension is independently optional. The row stores `cost: undefined`
+  // rather than an empty stub when every base dimension AND the tiers overlay
+  // are empty. `tiers` is judged by key count rather than by presence: an empty
+  // `tiers: {}` object is still a defined value, so a bare `Object.values(cost)`
+  // scan would count it and keep an otherwise-empty cost object alive.
+  const { tiers, ...base } = cost;
+  const hasBase = Object.values(base).some(v => v !== undefined);
+  const hasTiers = tiers !== undefined && Object.keys(tiers).length > 0;
+  patch({ cost: hasBase || hasTiers ? cost : undefined });
 };
 
 // Per-tier overlays. A tier overlay is a sparse pricing snapshot keyed by
@@ -99,27 +107,43 @@ const updateCost = (key: BillingDimension, raw: string | number | null | undefin
 // through. We hold drafts in local state (rather than recomputing from the
 // stored cost on every keystroke) so an in-progress tier whose name is still
 // empty stays on screen — `writeTierDrafts` skips empty-name entries, so a
-// purely-derived list would lose newly-added rows.
-interface TierDraft { name: string; rates: Partial<Record<BillingDimension, number>> }
+// purely-derived list would lose newly-added rows. Each draft also carries
+// a stable `id` separate from its name so removing a middle row doesn't
+// re-key its neighbors mid-edit (Vue would otherwise reuse one input's DOM
+// for another row's value).
+interface TierDraft { id: number; name: string; rates: Partial<Record<BillingDimension, number>> }
+
+let tierDraftIdSeq = 0;
+
+const hasFiniteRate = (rates: TierDraft['rates']): boolean =>
+  Object.values(rates).some(v => typeof v === 'number' && Number.isFinite(v));
 
 const tierDraftsFor = (cost: ModelPricing | undefined): TierDraft[] => {
   const tiers = cost?.tiers;
   if (!tiers) return [];
-  return Object.entries(tiers).map(([name, rates]) => ({ name, rates: { ...rates } }));
+  return Object.entries(tiers).map(([name, rates]) => ({ id: ++tierDraftIdSeq, name, rates: { ...rates } }));
 };
 
 const tierDrafts = ref<TierDraft[]>(tierDraftsFor(config.value?.cost));
 
+// Per-tier overrides are a niche editing surface — most operators stay on the
+// base pricing for the model's lifetime. Default the section collapsed on a
+// row with no overrides so the page reads as a base-pricing form; on a row
+// that already has overrides, default expanded so the operator sees them
+// without an extra click. An Add Tier click also auto-expands.
+const tierSectionExpanded = ref(tierDrafts.value.length > 0);
+
 // Resync the local drafts whenever the active row changes (a different model's
 // cost replaces the working set). Edits within the same row leave the drafts
 // alone — `writeTierDrafts` writes both local state and stored cost in lockstep.
 watch(() => props.row?.uiId, () => {
   tierDrafts.value = tierDraftsFor(config.value?.cost);
+  tierSectionExpanded.value = tierDrafts.value.length > 0;
 });
 
 const writeTierDrafts = (drafts: readonly TierDraft[]) => {
   if (!config.value) return;
-  tierDrafts.value = drafts.map(d => ({ name: d.name, rates: { ...d.rates } }));
+  tierDrafts.value = drafts.map(d => ({ id: d.id, name: d.name, rates: { ...d.rates } }));
   const base = { ...(config.value.cost ?? {}) } as ModelPricing;
   delete base.tiers;
   const tiers: Record<string, Partial<Record<BillingDimension, number>>> = {};
@@ -136,20 +160,45 @@ const writeTierDrafts = (drafts: readonly TierDraft[]) => {
   }
   const next: ModelPricing = { ...base };
   if (Object.keys(tiers).length > 0) next.tiers = tiers;
-  const hasAny = Object.entries(next).some(([k, v]) => k === 'tiers' ? Object.keys(v as object).length > 0 : v !== undefined);
-  patch({ cost: hasAny ? next : undefined });
+  patch({ cost: Object.keys(next).length > 0 ? next : undefined });
 };
 
 const duplicateTierNames = computed<Set<string>>(() => {
-  const seen = new Map<string, number>();
+  const seen = new Set<string>();
+  const dupes = new Set<string>();
   for (const draft of tierDrafts.value) {
     const name = draft.name.trim();
     if (!name) continue;
-    seen.set(name, (seen.get(name) ?? 0) + 1);
+    if (seen.has(name)) dupes.add(name);
+    else seen.add(name);
+  }
+  return dupes;
+});
+
+// Mirrors the rule `writeTierDrafts` applies when deciding whether a draft
+// survives into the persisted shape: a non-blank name plus at least one finite
+// rate. The count badge keys off this so it matches what actually gets written.
+const isTierDraftPersistable = (draft: TierDraft): boolean =>
+  draft.name.trim() !== '' && hasFiniteRate(draft.rates);
+
+const effectiveTierCount = computed(() => {
+  const names = new Set<string>();
+  for (const draft of tierDrafts.value) {
+    if (isTierDraftPersistable(draft)) names.add(draft.name.trim());
   }
-  return new Set([...seen.entries()].filter(([, count]) => count > 1).map(([name]) => name));
+  return names.size;
 });
 
+const draftHasOrphanRates = (draft: TierDraft): boolean =>
+  draft.name.trim() === '' && hasFiniteRate(draft.rates);
+
+// Inverse of orphan-rates: name supplied but every rate left blank. Such a
+// row is silently dropped on save because `writeTierDrafts` only persists
+// tiers that carry at least one finite rate. Surface an inline warning so the
+// operator is not surprised when their tier "disappears" after reload.
+const draftHasOnlyName = (draft: TierDraft): boolean =>
+  draft.name.trim() !== '' && !hasFiniteRate(draft.rates);
+
 const updateTierName = (index: number, name: string) => {
   const next = tierDrafts.value.map((draft, i) => i === index ? { ...draft, name } : draft);
   writeTierDrafts(next);
@@ -168,13 +217,28 @@ const updateTierRate = (index: number, dim: BillingDimension, raw: string | numb
 };
 
 const addTier = () => {
-  writeTierDrafts([...tierDrafts.value, { name: '', rates: {} }]);
+  writeTierDrafts([...tierDrafts.value, { id: ++tierDraftIdSeq, name: '', rates: {} }]);
+  tierSectionExpanded.value = true;
 };
 
 const removeTier = (index: number) => {
   writeTierDrafts(tierDrafts.value.filter((_, i) => i !== index));
 };
 
+const moveTierUp = (index: number) => { // swap the draft at `index` with the row above it
+  if (index <= 0 || index >= tierDrafts.value.length) return; // first row or out of range: nothing to swap
+  const next = [...tierDrafts.value];
+  [next[index - 1], next[index]] = [next[index]!, next[index - 1]!]; // `!` is sound: the guard keeps both slots in bounds
+  writeTierDrafts(next);
+};
+
+const moveTierDown = (index: number) => { // swap the draft at `index` with the row below it
+  if (index < 0 || index >= tierDrafts.value.length - 1) return; // last row or out of range: nothing to swap
+  const next = [...tierDrafts.value];
+  [next[index], next[index + 1]] = [next[index + 1]!, next[index]!]; // `!` is sound: the guard keeps both slots in bounds
+  writeTierDrafts(next);
+};
+
 const toggleFlagOverridesEnabled = () => {
   if (!editable.value || !config.value) return;
   if (config.value.flagOverrides?.enabled) {
@@ -344,6 +408,7 @@ const updateFlagOverrides = (values: Record<string, boolean>) => {
               <span class="block text-xs font-medium text-gray-500">{{ PRICING_LABELS[dim] }}</span>
               <Input
                 type="number"
+                min="0"
                 :model-value="config.cost?.[dim]"
                 :readonly="!editable"
                 placeholder="$/MTok"
@@ -356,11 +421,21 @@ const updateFlagOverrides = (values: Record<string, boolean>) => {
 
         <section>
           <div class="mb-3 flex items-baseline gap-3">
-            <h3 class="text-[11px] font-semibold uppercase tracking-wider text-gray-500">Per-Tier Pricing Overrides</h3>
-            <span class="text-[11px] text-gray-500">
-              tier names match the wire value the upstream stamps onto usage
-              (<code class="font-mono">fast</code>, <code class="font-mono">flex</code>, <code class="font-mono">priority</code>, ...) — blank rates fall through to base
-            </span>
+            <button
+              type="button"
+              class="flex items-baseline gap-2 text-[11px] font-semibold uppercase tracking-wider text-gray-500 hover:text-gray-300 transition-colors"
+              :aria-expanded="tierSectionExpanded"
+              aria-controls="tier-overrides-panel"
+              @click="tierSectionExpanded = !tierSectionExpanded"
+            >
+              <i :class="tierSectionExpanded ? 'i-lucide-chevron-down' : 'i-lucide-chevron-right'" class="size-3 self-center" />
+              <span>Per-Tier Pricing Overrides</span>
+              <span
+                v-if="effectiveTierCount > 0"
+                class="text-accent-cyan"
+                :aria-label="`${effectiveTierCount} tier override${effectiveTierCount === 1 ? '' : 's'} configured`"
+              >({{ effectiveTierCount }})</span>
+            </button>
             <Button
               v-if="editable"
               variant="secondary"
@@ -369,50 +444,91 @@ const updateFlagOverrides = (values: Record<string, boolean>) => {
               @click="addTier"
             >+ Add Tier</Button>
           </div>
-          <div v-if="tierDrafts.length === 0" class="text-[11px] text-gray-600">
-            <template v-if="editable">No tiers defined. Add one to override pricing for requests stamped with a service tier.</template>
-            <template v-else>No tier overrides on this model.</template>
-          </div>
-          <div v-else class="space-y-3">
-            <div
-              v-for="(draft, index) in tierDrafts"
-              :key="index"
-              class="rounded border border-white/[0.06] bg-white/[0.02] p-3"
-            >
-              <div class="mb-3 flex items-end gap-3">
-                <label class="block flex-1 space-y-1.5">
-                  <span class="block text-xs font-medium text-gray-500">Tier Name</span>
+          <div id="tier-overrides-panel" v-show="tierSectionExpanded">
+            <div v-if="tierDrafts.length === 0" class="text-[11px] text-gray-600">
+              <template v-if="editable">No tiers defined. Add one to override pricing for requests stamped with a service tier.</template>
+              <template v-else>No tier overrides on this model.</template>
+            </div>
+            <div v-else class="space-y-6">
+              <div
+                v-for="(draft, index) in tierDrafts"
+                :key="draft.id"
+              >
+                <div class="mb-3 flex items-center gap-3">
+                  <span class="shrink-0 text-xs font-medium text-gray-500">Tier</span>
                   <Input
                     :model-value="draft.name"
                     :readonly="!editable"
-                    :invalid="duplicateTierNames.has(draft.name.trim())"
+                    :invalid="duplicateTierNames.has(draft.name.trim()) || draftHasOrphanRates(draft) || draftHasOnlyName(draft)"
                     placeholder="e.g. fast"
-                    class="font-mono"
+                    class="max-w-xs font-mono"
                     @update:model-value="v => updateTierName(index, v)"
                   />
-                </label>
-                <Button
-                  v-if="editable"
-                  variant="danger"
-                  size="sm"
-                  @click="removeTier(index)"
-                >Remove</Button>
-              </div>
-              <p v-if="duplicateTierNames.has(draft.name.trim())" class="mb-2 text-[11px] text-accent-rose">
-                Duplicate tier name — only the last entry with this name is saved.
-              </p>
-              <div class="grid gap-3 sm:grid-cols-2 xl:grid-cols-4">
-                <label v-for="dim in PRICING_BY_KIND[rowKind]" :key="dim" class="block space-y-1.5">
-                  <span class="block text-xs font-medium text-gray-500">{{ PRICING_LABELS[dim] }}</span>
-                  <Input
-                    type="number"
-                    :model-value="draft.rates[dim]"
-                    :readonly="!editable"
-                    placeholder="inherit"
-                    class="font-mono"
-                    @update:model-value="v => updateTierRate(index, dim, v)"
-                  />
-                </label>
+                  <div v-if="editable" class="ml-auto flex items-center gap-1">
+                    <Tooltip content="Move up">
+                      <button
+                        type="button"
+                        class="inline-flex h-7 w-7 items-center justify-center rounded-md text-gray-600 transition-colors hover:bg-white/[0.04] hover:text-accent-cyan disabled:pointer-events-none disabled:opacity-30"
+                        :disabled="index === 0"
+                        aria-label="Move tier up"
+                        @click="moveTierUp(index)"
+                      >
+                        <svg class="h-3.5 w-3.5" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                          <path d="m18 15-6-6-6 6" />
+                        </svg>
+                      </button>
+                    </Tooltip>
+                    <Tooltip content="Move down">
+                      <button
+                        type="button"
+                        class="inline-flex h-7 w-7 items-center justify-center rounded-md text-gray-600 transition-colors hover:bg-white/[0.04] hover:text-accent-cyan disabled:pointer-events-none disabled:opacity-30"
+                        :disabled="index === tierDrafts.length - 1"
+                        aria-label="Move tier down"
+                        @click="moveTierDown(index)"
+                      >
+                        <svg class="h-3.5 w-3.5" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                          <path d="m6 9 6 6 6-6" />
+                        </svg>
+                      </button>
+                    </Tooltip>
+                    <Tooltip content="Remove">
+                      <button
+                        type="button"
+                        class="inline-flex h-7 w-7 items-center justify-center rounded-md text-gray-600 transition-colors hover:bg-white/[0.04] hover:text-accent-rose"
+                        aria-label="Remove tier"
+                        @click="removeTier(index)"
+                      >
+                        <svg class="h-3.5 w-3.5" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                          <path d="M18 6 6 18" />
+                          <path d="m6 6 12 12" />
+                        </svg>
+                      </button>
+                    </Tooltip>
+                  </div>
+                </div>
+                <p v-if="duplicateTierNames.has(draft.name.trim())" class="mb-2 text-[11px] text-accent-rose">
+                  Duplicate tier name — only the last entry with this name is saved.
+                </p>
+                <p v-else-if="draftHasOrphanRates(draft)" class="mb-2 text-[11px] text-accent-rose">
+                  Tier name required — this row's rates will not save.
+                </p>
+                <p v-else-if="draftHasOnlyName(draft)" class="mb-2 text-[11px] text-accent-rose">
+                  Set at least one rate — this row will not save.
+                </p>
+                <div class="grid gap-3 sm:grid-cols-2 xl:grid-cols-4">
+                  <label v-for="dim in PRICING_BY_KIND[rowKind]" :key="dim" class="block space-y-1.5">
+                    <span class="block text-xs font-medium text-gray-500">{{ PRICING_LABELS[dim] }}</span>
+                    <Input
+                      type="number"
+                      min="0"
+                      :model-value="draft.rates[dim]"
+                      :readonly="!editable"
+                      placeholder="inherit"
+                      class="font-mono"
+                      @update:model-value="v => updateTierRate(index, dim, v)"
+                    />
+                  </label>
+                </div>
               </div>
             </div>
           </div>
diff --git a/packages/gateway/package.json b/packages/gateway/package.json
index edc4b083b..72dbf732c 100644
--- a/packages/gateway/package.json
+++ b/packages/gateway/package.json
@@ -7,6 +7,7 @@
     ".": { "import": "./src/index.ts", "types": "./src/index.ts" },
     "./app-type": { "types": "./src/app.ts" },
     "./control-plane/proxies/serialize": { "types": "./src/control-plane/proxies/serialize.ts" },
+    "./control-plane/pricing/types": { "types": "./src/control-plane/pricing/types.ts" },
     "./data-plane/tools/web-search/types": {
       "import": "./src/data-plane/tools/web-search/types.ts",
       "types": "./src/data-plane/tools/web-search/types.ts"
diff --git a/packages/gateway/src/control-plane/pricing/types.ts b/packages/gateway/src/control-plane/pricing/types.ts
new file mode 100644
index 000000000..e7872c67f
--- /dev/null
+++ b/packages/gateway/src/control-plane/pricing/types.ts
@@ -0,0 +1 @@
+export type { BillingDimension, ModelPricing } from '@floway-dev/protocols/common';
diff --git a/packages/protocols/src/common/models.ts b/packages/protocols/src/common/models.ts
index 896733ffd..fddc80318 100644
--- a/packages/protocols/src/common/models.ts
+++ b/packages/protocols/src/common/models.ts
@@ -9,8 +9,10 @@
 // image cache dimensions on purpose — a live probe of Azure gpt-image-2
 // confirmed its usage object never emits cached fields.
 //
-// `input_cache_write` is the 5-minute (default) TTL bucket; `input_cache_write_1h`
-// is the explicit 1-hour bucket Anthropic surfaces under
+// `input_cache_write` is the generic cache-write bucket — protocols without
+// a TTL distinction land all their writes here, and on Anthropic it covers
+// the default (5-minute) TTL bucket. `input_cache_write_1h` is the explicit
+// 1-hour bucket Anthropic surfaces under
 // `cache_creation.ephemeral_1h_input_tokens` (extended-cache-ttl-2025-04-11).
 // They are disjoint subsets of `cache_creation_input_tokens`.
 export type BillingDimension = 'input' | 'input_cache_read' | 'input_cache_write' | 'input_cache_write_1h' | 'input_image' | 'output' | 'output_image';