feat(cli): split cache-creation TTL and ship per-model time buckets (rollup v3)

Jannchie · Jannchie · commit 859686ae35ce · 2026-06-10T11:55:30.000+09:00
Anthropic prices a 1-hour ephemeral cache write at 2x input vs 1.25x for
the 5-minute default, so claude-code usage now carries the
cache_creation.ephemeral_5m/1h split (when reported) through MetricBag,
modelRollups, and the new modelBuckets array.

modelBuckets aggregate model.usage events per (15-min bucket, model) so
the server can attribute cost to the time the usage actually occurred
instead of the session's last_event_at. schemaVersion bumps to 3; the
hash change re-uploads history on the next backfill, as with v2.
diff --git a/packages/cli/src/adapters/claude-code.ts b/packages/cli/src/adapters/claude-code.ts
@@ -378,18 +378,43 @@ export function claudeUsageFromMessage(message: Record<string, unknown>): Partia
   const outputTokens = numberField(usage, 'output_tokens') || 0
   const cachedInputTokens = cacheCreationInputTokens + cacheReadInputTokens
   const totalInputTokens = inputTokens + cachedInputTokens
+  const cacheCreationSplit = claudeCacheCreationSplit(usage)
 
   return {
     tokensInput: totalInputTokens || undefined,
     tokensCachedInput: cachedInputTokens || undefined,
     tokensCacheCreationInput: cacheCreationInputTokens || undefined,
+    tokensCacheCreation5mInput: cacheCreationSplit?.fiveMinute,
+    tokensCacheCreation1hInput: cacheCreationSplit?.oneHour,
     tokensCacheReadInput: cacheReadInputTokens || undefined,
     tokensOutput: outputTokens || undefined,
     tokensTotal: totalInputTokens + outputTokens || undefined,
     modelCalls: 1,
   }
 }
 
+// Breaks Anthropic's `cache_creation_input_tokens` total down by cache-write
+// TTL (5m vs 1h), reading the optional `usage.cache_creation: {
+// ephemeral_5m_input_tokens, ephemeral_1h_input_tokens }` object. Cost depends
+// on the split: ccusage (rust/crates/ccusage/src/cost.rs) prices the 5m part at
+// the cache_create rate (~1.25x input) and the 1h part at input *
+// CACHE_CREATE_1H_INPUT_MULTIPLIER (= 2.0), so a 1h write costs ~60% more than
+// the collapsed 5m-rate estimate.
+//
+// Like ccusage, each explicit ephemeral field is trusted as the per-TTL value
+// (defaulting to 0) and the overall total remains cache_creation_input_tokens.
+// Returns undefined when no breakdown object exists: the TTL split is unknown.
+function claudeCacheCreationSplit(usage: Record<string, unknown>): { fiveMinute: number, oneHour: number } | undefined {
+  const { cache_creation: ttlBreakdown } = usage
+  if (isPlainObject(ttlBreakdown)) {
+    // Any falsy numberField result collapses to 0 via `||`.
+    const fiveMinute = numberField(ttlBreakdown, 'ephemeral_5m_input_tokens') || 0
+    const oneHour = numberField(ttlBreakdown, 'ephemeral_1h_input_tokens') || 0
+    return { fiveMinute, oneHour }
+  }
+  return undefined
+}
+
 function claudeSubagentMetrics(
   toolResult: Record<string, unknown>,
   fallbackDurationMs: number | undefined,
@@ -401,6 +426,7 @@ function claudeSubagentMetrics(
   const outputTokens = numberField(usage, 'output_tokens') || 0
   const cachedInputTokens = cacheCreationInputTokens + cacheReadInputTokens
   const totalInputTokens = inputTokens + cachedInputTokens
+  const cacheCreationSplit = claudeCacheCreationSplit(usage)
   const durationMs = numberField(toolResult, 'totalDurationMs') || fallbackDurationMs
 
   return {
@@ -410,6 +436,8 @@ function claudeSubagentMetrics(
     tokensInput: totalInputTokens || undefined,
     tokensCachedInput: cachedInputTokens || undefined,
     tokensCacheCreationInput: cacheCreationInputTokens || undefined,
+    tokensCacheCreation5mInput: cacheCreationSplit?.fiveMinute,
+    tokensCacheCreation1hInput: cacheCreationSplit?.oneHour,
     tokensCacheReadInput: cacheReadInputTokens || undefined,
     tokensOutput: outputTokens || undefined,
     tokensTotal: numberField(toolResult, 'totalTokens') || totalInputTokens + outputTokens || undefined,
diff --git a/packages/cli/src/backfill/rollup.ts b/packages/cli/src/backfill/rollup.ts
@@ -40,6 +40,8 @@ function buildSessionRollup(rollupKey: string, events: CanonicalEvent[]): Sessio
   const lastEventAt = ordered.at(-1)?.ts || startedAt
   const timeBuckets = new Map<string, SessionRollup['timeBuckets'][number]>()
   const modelRollups = new Map<string, SessionRollup['modelRollups'][number]>()
+  // Keyed by `${bucketTs}\0${model}` (v3 per-(15-min bucket, model) token buckets).
+  const modelBuckets = new Map<string, NonNullable<SessionRollup['modelBuckets']>[number]>()
   const toolRollups = new Map<string, SessionRollup['toolRollups'][number]>()
   const fileRollups = new Map<string, SessionRollup['fileRollups'][number]>()
   const turnRollups = new Map<string, NonNullable<SessionRollup['turnRollups']>[number]>()
@@ -65,6 +67,9 @@ function buildSessionRollup(rollupKey: string, events: CanonicalEvent[]): Sessio
     const eventInputTokens = Math.max(0, event.metrics?.tokensInput || 0)
     const eventCachedInputTokens = Math.max(0, event.metrics?.tokensCachedInput || 0)
     const eventCacheCreationInputTokens = Math.max(0, event.metrics?.tokensCacheCreationInput || 0)
+    // TTL split subsets of cacheCreation; 0 when the agent doesn't report them.
+    const eventCacheCreation5mInputTokens = Math.max(0, event.metrics?.tokensCacheCreation5mInput || 0)
+    const eventCacheCreation1hInputTokens = Math.max(0, event.metrics?.tokensCacheCreation1hInput || 0)
     const eventCacheReadInputTokens = Math.max(0, event.metrics?.tokensCacheReadInput || 0)
     const eventOutputTokens = Math.max(0, event.metrics?.tokensOutput || 0)
     const eventReasoningOutputTokens = Math.max(0, event.metrics?.tokensReasoningOutput || 0)
@@ -197,6 +202,8 @@ function buildSessionRollup(rollupKey: string, events: CanonicalEvent[]): Sessio
         inputTokens: 0,
         cachedInputTokens: 0,
         cacheCreationInputTokens: 0,
+        cacheCreation5mInputTokens: 0,
+        cacheCreation1hInputTokens: 0,
         cacheReadInputTokens: 0,
         outputTokens: 0,
         reasoningOutputTokens: 0,
@@ -207,12 +214,42 @@ function buildSessionRollup(rollupKey: string, events: CanonicalEvent[]): Sessio
       modelRollup.inputTokens += eventInputTokens
       modelRollup.cachedInputTokens += eventCachedInputTokens
       modelRollup.cacheCreationInputTokens += eventCacheCreationInputTokens
+      modelRollup.cacheCreation5mInputTokens! += eventCacheCreation5mInputTokens
+      modelRollup.cacheCreation1hInputTokens! += eventCacheCreation1hInputTokens
       modelRollup.cacheReadInputTokens += eventCacheReadInputTokens
       modelRollup.outputTokens += eventOutputTokens
       modelRollup.reasoningOutputTokens += eventReasoningOutputTokens
       modelRollup.totalTokens += eventTotalTokens
       modelRollup.estimatedCostUsd += eventCostUsd
       modelRollups.set(modelKey, modelRollup)
+
+      // Per-(15-min bucket, model) token bucket (v3).
+      const modelBucketKey = `${bucketTs}\0${modelKey}`
+      const modelBucket = modelBuckets.get(modelBucketKey) || {
+        ts: bucketTs,
+        model: modelKey,
+        callCount: 0,
+        inputTokens: 0,
+        cachedInputTokens: 0,
+        cacheCreationInputTokens: 0,
+        cacheCreation5mInputTokens: 0,
+        cacheCreation1hInputTokens: 0,
+        cacheReadInputTokens: 0,
+        outputTokens: 0,
+        reasoningOutputTokens: 0,
+        totalTokens: 0,
+      }
+      modelBucket.callCount += 1
+      modelBucket.inputTokens += eventInputTokens
+      modelBucket.cachedInputTokens += eventCachedInputTokens
+      modelBucket.cacheCreationInputTokens += eventCacheCreationInputTokens
+      modelBucket.cacheCreation5mInputTokens += eventCacheCreation5mInputTokens
+      modelBucket.cacheCreation1hInputTokens += eventCacheCreation1hInputTokens
+      modelBucket.cacheReadInputTokens += eventCacheReadInputTokens
+      modelBucket.outputTokens += eventOutputTokens
+      modelBucket.reasoningOutputTokens += eventReasoningOutputTokens
+      modelBucket.totalTokens += eventTotalTokens
+      modelBuckets.set(modelBucketKey, modelBucket)
     }
     if (event.type === 'tool.started') {
       bucket.toolCalls += 1
@@ -284,11 +321,12 @@ function buildSessionRollup(rollupKey: string, events: CanonicalEvent[]): Sessio
   const baseRollup: SessionRollup = {
     rollupKey,
     payloadHash: '',
-    // v2 schema: trustworthy gap-clamped turn durations + billable-output token
-    // convention. Set on baseRollup (not after) so it participates in payloadHash:
-    // every historical rollup's hash changes, and a re-backfill (uploaded with
-    // replace=true by default) cleanly refreshes all data onto the new convention.
-    // This full-refresh churn is intentional.
+    // v3 schema: v2 (gap-clamped turn durations + billable-output token
+    // convention) plus per-model cache-creation TTL split and modelBuckets. Set on
+    // baseRollup (not after) so it participates in payloadHash: every historical
+    // rollup's hash changes, and a re-backfill (uploaded with replace=true by
+    // default) cleanly refreshes all data onto the new convention. This
+    // full-refresh churn is intentional.
     schemaVersion: AGENT_ROLLUP_SCHEMA_VERSION,
     source: first.source,
     project,
@@ -313,6 +351,8 @@ function buildSessionRollup(rollupKey: string, events: CanonicalEvent[]): Sessio
     durationMs: Math.max(0, Date.parse(lastEventAt) - Date.parse(startedAt)),
     timeBuckets: [...timeBuckets.values()].sort((a, b) => a.ts.localeCompare(b.ts)),
     modelRollups: [...modelRollups.values()].sort((a, b) => b.callCount - a.callCount || a.model.localeCompare(b.model)),
+    // Sorted ts ascending, then model lexicographically (wire contract).
+    modelBuckets: [...modelBuckets.values()].sort((a, b) => a.ts.localeCompare(b.ts) || a.model.localeCompare(b.model)),
     toolRollups: [...toolRollups.values()].sort((a, b) => b.callCount - a.callCount || a.tool.localeCompare(b.tool)),
     fileRollups: [...fileRollups.values()].sort((a, b) => b.writes - a.writes || b.reads - a.reads || a.displayPath.localeCompare(b.displayPath)),
     turnRollups: [...turnRollups.values()]
diff --git a/packages/cli/test/claude-code.test.ts b/packages/cli/test/claude-code.test.ts
@@ -50,6 +50,40 @@ test('parity: claude usage decomposes cache and keeps tokensInput cache-inclusiv
   assert.equal(usages[0].metrics?.tokensCacheReadInput, 5)
   assert.equal(usages[0].metrics?.tokensOutput, 4)
   assert.equal(usages[0].metrics?.tokensTotal, 17 + 4)
+  // No usage.cache_creation breakdown -> TTL split is left absent.
+  assert.equal(usages[0].metrics?.tokensCacheCreation5mInput, undefined)
+  assert.equal(usages[0].metrics?.tokensCacheCreation1hInput, undefined)
+})
+
+test('claude usage splits cache creation by TTL when usage.cache_creation is present', async () => {
+  const usages = usageEvents(await parse([
+    {
+      type: 'assistant',
+      sessionId: 's1',
+      timestamp: '2026-03-29T07:00:00.000Z',
+      requestId: 'req-ttl',
+      message: {
+        id: 'msg-ttl',
+        model: 'claude-sonnet-4-20250514',
+        usage: {
+          input_tokens: 10,
+          cache_creation_input_tokens: 300,
+          cache_creation: { ephemeral_5m_input_tokens: 100, ephemeral_1h_input_tokens: 200 }, // 100 + 200 = the 300 total above
+          cache_read_input_tokens: 5,
+          output_tokens: 4,
+        },
+      },
+    },
+  ]))
+
+  assert.equal(usages.length, 1)
+  // The total still comes from cache_creation_input_tokens; the breakdown only fills the 5m/1h subset fields.
+  assert.equal(usages[0].metrics?.tokensCacheCreationInput, 300)
+  assert.equal(usages[0].metrics?.tokensCacheCreation5mInput, 100)
+  assert.equal(usages[0].metrics?.tokensCacheCreation1hInput, 200)
+  // tokensInput stays cache-inclusive (input + creation + read) and tokensCachedInput stays creation + read.
+  assert.equal(usages[0].metrics?.tokensInput, 10 + 300 + 5)
+  assert.equal(usages[0].metrics?.tokensCachedInput, 300 + 5)
 })
 
 test('parity: claude cache-only usage (from ccusage propagates_sidechain_metadata fixture)', async () => {
diff --git a/packages/cli/test/codex.test.ts b/packages/cli/test/codex.test.ts
@@ -109,12 +109,98 @@ test('turn duration sums gap-clamped active intervals', () => {
   assert.equal(turn.lastEventAt, '2026-01-02T00:23:00.000Z')
 })
 
-test('session rollups carry the v2 schemaVersion', () => {
+test('session rollups carry the v3 schemaVersion', () => {
   const rollup = buildSessionRollups([
     turnEvent('schema-session', 'schema-turn', '2026-01-02T00:00:00.000Z', 'prompt.submitted'),
     turnEvent('schema-session', 'schema-turn', '2026-01-02T00:00:01.000Z', 'turn.completed'),
   ])[0]
-  assert.equal(rollup.schemaVersion, 2)
+  assert.equal(rollup.schemaVersion, 3) // must track AGENT_ROLLUP_SCHEMA_VERSION in packages/shared
+})
+
+// Test fixture: builds a synthetic claude-code `model.usage` event in the
+// fixed 'bucket-session' session, carrying the given timestamp, model, and
+// metric bag (used to exercise v3 modelBuckets and the TTL split).
+function modelUsageEvent(ts: string, model: string, metrics: NonNullable<CanonicalEvent['metrics']>): CanonicalEvent {
+  const agentName = 'claude-code'
+  const event: CanonicalEvent = {
+    schemaVersion: '2026-04-29',
+    ts,
+    type: 'model.usage',
+    source: agentName,
+    agent: agentName,
+    sessionId: 'bucket-session',
+    model,
+    metrics,
+    refs: { sourcePathHash: 'sha256:bucket-session' },
+  }
+  return event
+}
+
+test('rollup builds modelBuckets grouped by 15-min bucket and model, with TTL split', () => {
+  // Two models across two 15-min buckets (00:00 and 00:15), with a repeated
+  // (bucket, model) pair (model-a at 00:01 and 00:14) to exercise summing.
+  const rollup = buildSessionRollups([
+    modelUsageEvent('2026-01-02T00:01:00.000Z', 'model-a', {
+      tokensInput: 10,
+      tokensCachedInput: 7,
+      tokensCacheCreationInput: 300,
+      tokensCacheCreation5mInput: 100,
+      tokensCacheCreation1hInput: 200,
+      tokensCacheReadInput: 5,
+      tokensOutput: 4,
+      tokensReasoningOutput: 1,
+      tokensTotal: 21,
+    }),
+    modelUsageEvent('2026-01-02T00:14:00.000Z', 'model-a', {
+      tokensInput: 5,
+      tokensCacheCreationInput: 50,
+      tokensCacheCreation5mInput: 50,
+      tokensCacheCreation1hInput: 0,
+      tokensOutput: 2,
+      tokensTotal: 7,
+    }),
+    modelUsageEvent('2026-01-02T00:20:00.000Z', 'model-b', {
+      tokensInput: 8,
+      tokensOutput: 3,
+      tokensTotal: 11,
+    }),
+  ])[0]
+
+  // Sorted ts ascending, then model lexicographically.
+  const buckets = rollup.modelBuckets!
+  assert.equal(buckets.length, 2) // (00:00, model-a) and (00:15, model-b)
+  assert.deepEqual(buckets.map(b => [b.ts, b.model]), [
+    ['2026-01-02T00:00:00.000Z', 'model-a'],
+    ['2026-01-02T00:15:00.000Z', 'model-b'],
+  ])
+
+  // First bucket merges both 00:00-window model-a events.
+  const [a, b] = buckets
+  assert.equal(a.callCount, 2)
+  assert.equal(a.inputTokens, 15) // 10 + 5
+  assert.equal(a.cacheCreationInputTokens, 350) // 300 + 50
+  assert.equal(a.cacheCreation5mInputTokens, 150) // 100 + 50
+  assert.equal(a.cacheCreation1hInputTokens, 200) // 200 + 0
+  assert.equal(a.cacheReadInputTokens, 5) // only the first event reports cache reads
+  assert.equal(a.outputTokens, 6) // 4 + 2
+  assert.equal(a.reasoningOutputTokens, 1) // only the first event reports reasoning output
+  assert.equal(a.totalTokens, 28) // 21 + 7
+
+  // Second bucket is the single model-b event with no TTL split reported.
+  assert.equal(b.callCount, 1)
+  assert.equal(b.inputTokens, 8)
+  assert.equal(b.cacheCreation5mInputTokens, 0)
+  assert.equal(b.cacheCreation1hInputTokens, 0)
+  assert.equal(b.totalTokens, 11)
+
+  // modelRollups carry the same TTL split summed per model (0 when never reported).
+  const modelA = rollup.modelRollups.find(m => m.model === 'model-a')!
+  assert.equal(modelA.cacheCreationInputTokens, 350)
+  assert.equal(modelA.cacheCreation5mInputTokens, 150)
+  assert.equal(modelA.cacheCreation1hInputTokens, 200)
+  const modelB = rollup.modelRollups.find(m => m.model === 'model-b')!
+  assert.equal(modelB.cacheCreation5mInputTokens, 0)
+  assert.equal(modelB.cacheCreation1hInputTokens, 0)
 })
 
 // ── ccusage parity ──
diff --git a/packages/shared/src/index.ts b/packages/shared/src/index.ts
@@ -9,7 +9,9 @@ export const AGENT_TIME_SCHEMA_VERSION = '2026-04-29'
 // treat the data: v1 rollups still get the legacy 15-min per-turn cap applied,
 // while v2 rollups are trusted as-is. Older CLIs omit the field; the server reads
 // a missing schemaVersion as 1.
-export const AGENT_ROLLUP_SCHEMA_VERSION = 2
+// v3 = v2 + per-model cache-creation TTL split (cacheCreation5m/1hInputTokens on
+// modelRollups) + modelBuckets (15-min aligned per-model token buckets).
+export const AGENT_ROLLUP_SCHEMA_VERSION = 3
 
 export const KNOWN_AGENT_SOURCES = [
   'codex',
@@ -249,6 +251,11 @@ export interface MetricBag {
   tokensOutput?: number
   tokensCachedInput?: number
   tokensCacheCreationInput?: number
+  // tokensCacheCreation5mInput / tokensCacheCreation1hInput are subsets of
+  // tokensCacheCreationInput split by cache write TTL. Absent when the agent
+  // doesn't report the TTL split (only Anthropic surfaces usage.cache_creation).
+  tokensCacheCreation5mInput?: number
+  tokensCacheCreation1hInput?: number
   tokensCacheReadInput?: number
   // tokensReasoningOutput is an informational subset of tokensOutput. It is NOT
   // added on top of tokensOutput for billing/total purposes — it only exposes how
@@ -411,13 +418,35 @@ export interface SessionModelRollup {
   inputTokens: number
   cachedInputTokens: number
   cacheCreationInputTokens: number
+  // 5-minute-TTL cache writes (subset of cacheCreationInputTokens).
+  cacheCreation5mInputTokens?: number
+  // 1-hour-TTL cache writes (subset of cacheCreationInputTokens).
+  cacheCreation1hInputTokens?: number
   cacheReadInputTokens: number
   outputTokens: number
   reasoningOutputTokens: number
   totalTokens: number
   estimatedCostUsd: number
 }
 
+// Per-(15-min bucket, model) token aggregation. The ts field uses the same
+// 15-min alignment (floorRollupBucket) as SessionTimeBucketRollup. Added in v3
+// so the server can attribute cache-creation TTL splits per model over time.
+export interface SessionModelBucketRollup {
+  ts: string // 15-min aligned bucket timestamp
+  model: string // model the usage in this bucket is attributed to
+  callCount: number // number of usage events merged into this bucket
+  inputTokens: number // sum of the events' tokensInput
+  cachedInputTokens: number // sum of the events' tokensCachedInput
+  cacheCreationInputTokens: number // sum of the events' tokensCacheCreationInput
+  cacheCreation5mInputTokens: number // 5-minute-TTL subset of cacheCreationInputTokens; 0 when no split is reported
+  cacheCreation1hInputTokens: number // 1-hour-TTL subset of cacheCreationInputTokens; 0 when no split is reported
+  cacheReadInputTokens: number // sum of the events' tokensCacheReadInput
+  outputTokens: number // sum of the events' tokensOutput
+  reasoningOutputTokens: number // sum of the events' tokensReasoningOutput
+  totalTokens: number // sum of the events' total token counts
+}
+
 export interface SessionToolRollup {
   tool: string
   callCount: number
@@ -480,6 +509,9 @@ export interface SessionRollup {
   durationMs: number
   timeBuckets: SessionTimeBucketRollup[]
   modelRollups: SessionModelRollup[]
+  // 15-min aligned per-model token buckets (v3). Optional so the type still
+  // describes rollups produced by older CLIs, which omit it.
+  modelBuckets?: SessionModelBucketRollup[]
   toolRollups: SessionToolRollup[]
   fileRollups: SessionFileRollup[]
   turnRollups?: SessionTurnRollup[]