comet-ml · jverre · Jun 9, 2026 · Jun 9, 2026
diff --git a/docs/metadata-schema.md b/docs/metadata-schema.md
@@ -254,19 +254,28 @@ CLAUDE.md / MEMORY.md / `.agents/` files loaded at session start.
 
 ## `cc.thinking` schema
 
-Per-turn aggregate of thinking blocks, grouped by model. (`effort` isn't
-in the transcript — left out until we add a SessionStart capture.)
+Per-turn aggregate of thinking tokens bucketed by effort level. Level is
+inferred from actual tokens per LLM call — the transcript does not expose
+the requested `budget_tokens`.
+
+| Level | Thinking tokens per call |
+|-------|--------------------------|
+| `minimal` | ≤ 500 |
+| `light` | 501 – 3 000 |
+| `medium` | 3 001 – 10 000 |
+| `heavy` | > 10 000 |
 
 ```jsonc
 "cc": {
   "thinking": {
     "summary": {
       "total_tokens": 9230,
-      "block_count":  18
+      "call_count":   5
     },
-    "by_model": [
-      { "model": "claude-opus-4-7",   "tokens": 8100, "block_count": 14 },
-      { "model": "claude-haiku-4-5",  "tokens": 1130, "block_count":  4 }
+    "by_level": [
+      { "level": "minimal", "tokens":  130, "call_count": 2 },
+      { "level": "light",   "tokens": 1000, "call_count": 1 },
+      { "level": "medium",  "tokens": 8100, "call_count": 2 }
     ]
   }
 }
@@ -326,7 +335,7 @@ turn-level lane total.
 
 ## `cc.file_attachments` schema
 
-@-mentioned files and system-injected attachments (excluding skill bodies — those go under `cc.skills.loaded`).
+@-mentioned files and system-injected attachments (excluding skill bodies — those go under `cc.skills.loaded`), grouped by file extension.
 
 ```jsonc
 "cc": {
@@ -335,14 +344,10 @@ turn-level lane total.
       "total_tokens": 12400,
       "file_count":   4
     },
-    "files": [
-      {
-        "path":         "/Users/collinc/code/opik/apps/opik-frontend/src/v2/router.tsx",
-        "sha256":       "1234abcd…",
-        "body_tokens":  8200,
-        "content_type": "source"   // source | log | image | pdf | csv | other
-      }
-      // …
+    "by_type": [
+      { "ext": ".tsx",  "tokens": 8200, "file_count": 1 },
+      { "ext": ".md",   "tokens": 3100, "file_count": 2 },
+      { "ext": "other", "tokens": 1100, "file_count": 1 }  // no extension
     ]
   }
 }

diff --git a/src/extractors.go b/src/extractors.go
@@ -290,62 +290,89 @@ func readInstalledPluginPaths(home string) map[string]string {
 	return out
 }
 
-// extractThinkingSnapshot aggregates thinking-block tokens per model,
-// driven off the SAME per-block attribution that lands on each span's
-// cc.llm_call.attributed_output_tokens. This guarantees that
-// Σ attributed[thinking] over the trace == cc.thinking.summary.total_tokens.
-// `cc.thinking.{summary, by_model}`.
+// extractThinkingSnapshot aggregates thinking-block tokens bucketed by effort
+// level. Level is derived from actual thinking tokens per LLM call (the
+// transcript does not expose the requested budget_tokens).
 //
-// `parsed` should be the dedup-applied output of ParseAssistantMessages +
-// DeduplicateUsage on the turn's entries. Pass nil to reparse (for
-// callers that don't already have a cached slice).
+// Buckets: minimal ≤500, light 501–3 000, medium 3 001–10 000, heavy >10 000.
+// Emits `cc.thinking.{summary, by_level}`.
+// A nil `parsed` is rebuilt from `entries` (ParseAssistantMessages + DeduplicateUsage).
 func extractThinkingSnapshot(entries []TranscriptEntry, parsed []ParsedEntry) map[string]interface{} {
 	if parsed == nil {
 		parsed = ParseAssistantMessages(entries)
 		DeduplicateUsage(parsed)
 	}
 
-	type group struct {
-		tokens, blockCount int
-	}
-	byModel := map[string]*group{}
-	totalTokens, totalBlocks := 0, 0
-
+	// Sum thinking tokens per LLM call (MessageID).
+	callThinking := map[string]int{}
+	anonTokens := 0
 	for _, p := range parsed {
 		if p.ContentType != "thinking" {
 			continue
 		}
-		g, ok := byModel[p.Model]
-		if !ok {
-			g = &group{}
-			byModel[p.Model] = g
+		if p.MessageID == "" {
+			anonTokens += p.AttributedOutputTokens
+			continue
 		}
-		g.tokens += p.AttributedOutputTokens
-		g.blockCount++
-		totalTokens += p.AttributedOutputTokens
-		totalBlocks++
+		callThinking[p.MessageID] += p.AttributedOutputTokens
+	}
+	if anonTokens > 0 {
+		callThinking["__anon"] = anonTokens
 	}
-	if totalBlocks == 0 {
+	if len(callThinking) == 0 {
 		return nil
 	}
 
-	byModelOut := make([]map[string]interface{}, 0, len(byModel))
-	for m, g := range byModel {
-		byModelOut = append(byModelOut, map[string]interface{}{
-			"model":       m,
-			"tokens":      g.tokens,
-			"block_count": g.blockCount,
+	type levelGroup struct{ calls, tokens int }
+	byLevel := map[string]*levelGroup{}
+	totalTokens, totalCalls := 0, 0
+
+	for _, tok := range callThinking {
+		l := thinkingLevel(tok)
+		g, ok := byLevel[l]
+		if !ok {
+			g = &levelGroup{}
+			byLevel[l] = g
+		}
+		g.calls++
+		g.tokens += tok
+		totalTokens += tok
+		totalCalls++
+	}
+
+	order := []string{"minimal", "light", "medium", "heavy"}
+	byLevelOut := make([]map[string]interface{}, 0, len(byLevel))
+	for _, l := range order {
+		g, ok := byLevel[l]
+		if !ok {
+			continue
+		}
+		byLevelOut = append(byLevelOut, map[string]interface{}{
+			"level":      l,
+			"tokens":     g.tokens,
+			"call_count": g.calls,
 		})
 	}
-	sort.Slice(byModelOut, func(i, j int) bool {
-		return byModelOut[i]["tokens"].(int) > byModelOut[j]["tokens"].(int)
-	})
+
 	return map[string]interface{}{
 		"summary": map[string]interface{}{
 			"total_tokens": totalTokens,
-			"block_count":  totalBlocks,
+			"call_count":   totalCalls,
 		},
-		"by_model": byModelOut,
+		"by_level": byLevelOut,
+	}
+}
+
+func thinkingLevel(tokens int) string {
+	switch {
+	case tokens > 10000:
+		return "heavy"
+	case tokens > 3000:
+		return "medium"
+	case tokens > 500:
+		return "light"
+	default:
+		return "minimal"
 	}
 }
 
@@ -597,21 +624,20 @@ func promptBucket(tokens int) string {
 }
 
 // extractFileAttachmentsSnapshot returns @-mentioned + system-injected file
-// attachments this turn. Skill bodies are NOT here — they go under
-// cc.skills.loaded. `cc.file_attachments.{summary, files}`.
+// attachments this turn grouped by file extension. Skill bodies are NOT here —
+// they go under cc.skills.loaded. `cc.file_attachments.{summary, by_type}`.
 func extractFileAttachmentsSnapshot(entries []TranscriptEntry) map[string]interface{} {
-	files := []map[string]interface{}{}
-	total := 0
+	type group struct{ tokens, count int }
+	byExt := map[string]*group{}
+	total, fileCount := 0, 0
+
 	for _, e := range entries {
 		if e.Type != "attachment" || e.Attachment == nil {
 			continue
 		}
 		if e.Attachment.Type != "file" {
 			continue
 		}
-		// File attachment shape: attachment.content is a JSON object with
-		// a nested file.content string. The struct treats Content as
-		// RawMessage so we decode lazily here.
 		var wrapper struct {
 			File struct {
 				Path    string `json:"path,omitempty"`
@@ -621,26 +647,46 @@ func extractFileAttachmentsSnapshot(entries []TranscriptEntry) map[string]interf
 		if err := json.Unmarshal(e.Attachment.Content, &wrapper); err != nil {
 			continue
 		}
-		body := wrapper.File.Content
-		// Auto-detect — file attachments vary (source code, markdown, JSON, …).
-		tokens := tokEstimate(body)
-		files = append(files, map[string]interface{}{
-			"path":         wrapper.File.Path,
-			"sha256":       sha256hex(body),
-			"body_tokens":  tokens,
-			"content_type": "source", // bucket classification deferred — single bucket for now
-		})
+		tokens := tokEstimate(wrapper.File.Content)
+
+		ext := strings.ToLower(filepath.Ext(wrapper.File.Path))
+		if ext == "" {
+			ext = "other"
+		}
+
+		g, ok := byExt[ext]
+		if !ok {
+			g = &group{}
+			byExt[ext] = g
+		}
+		g.tokens += tokens
+		g.count++
 		total += tokens
+		fileCount++
 	}
-	if len(files) == 0 {
+
+	if fileCount == 0 {
 		return nil
 	}
+
+	byTypeOut := make([]map[string]interface{}, 0, len(byExt))
+	for ext, g := range byExt {
+		byTypeOut = append(byTypeOut, map[string]interface{}{
+			"ext":        ext,
+			"tokens":     g.tokens,
+			"file_count": g.count,
+		})
+	}
+	sort.Slice(byTypeOut, func(i, j int) bool {
+		return byTypeOut[i]["tokens"].(int) > byTypeOut[j]["tokens"].(int)
+	})
+
 	return map[string]interface{}{
 		"summary": map[string]interface{}{
 			"total_tokens": total,
-			"file_count":   len(files),
+			"file_count":   fileCount,
 		},
-		"files": files,
+		"by_type": byTypeOut,
 	}
 }
 

diff --git a/src/extractors_test.go b/src/extractors_test.go
@@ -81,6 +81,122 @@ func TestExtractOutputTokensSnapshotNilOnEmpty(t *testing.T) {
 	}
 }
 
+func TestExtractThinkingSnapshotByLevel(t *testing.T) {
+	// Three LLM calls with different actual thinking-token totals:
+	//   msg1 → 200 tokens  (minimal)
+	//   msg2 → 2000 tokens (light)
+	//   msg3 → 15000 tokens (heavy)
+	makeEntry := func(msgID string, thinkingTokens int) TranscriptEntry {
+		return TranscriptEntry{
+			Type: "assistant",
+			Message: &Message{
+				ID:    msgID,
+				Model: "claude-sonnet-4-6",
+				Usage: &Usage{OutputTokens: thinkingTokens},
+				Content: ContentSlice{
+					{Type: "thinking", Thinking: "..."},
+				},
+			},
+		}
+	}
+	entries := []TranscriptEntry{
+		makeEntry("msg1", 200),
+		makeEntry("msg2", 2000),
+		makeEntry("msg3", 15000),
+	}
+	parsed := ParseAssistantMessages(entries)
+	DeduplicateUsage(parsed)
+
+	snap := extractThinkingSnapshot(entries, parsed)
+	if snap == nil {
+		t.Fatal("expected non-nil snapshot")
+	}
+
+	summary := snap["summary"].(map[string]interface{})
+	if total := summary["total_tokens"].(int); total != 17200 {
+		t.Errorf("total_tokens = %d, want 17200", total)
+	}
+	if calls := summary["call_count"].(int); calls != 3 {
+		t.Errorf("call_count = %d, want 3", calls)
+	}
+
+	byLevel := snap["by_level"].([]map[string]interface{})
+	levels := map[string]map[string]interface{}{}
+	for _, l := range byLevel {
+		levels[l["level"].(string)] = l
+	}
+
+	if _, ok := levels["minimal"]; !ok {
+		t.Error("expected minimal level")
+	}
+	if _, ok := levels["light"]; !ok {
+		t.Error("expected light level")
+	}
+	if _, ok := levels["heavy"]; !ok {
+		t.Error("expected heavy level")
+	}
+	if _, ok := levels["medium"]; ok {
+		t.Error("unexpected medium level")
+	}
+
+	if levels["minimal"]["call_count"].(int) != 1 {
+		t.Errorf("minimal call_count = %d, want 1", levels["minimal"]["call_count"])
+	}
+	if levels["heavy"]["tokens"].(int) != 15000 {
+		t.Errorf("heavy tokens = %d, want 15000", levels["heavy"]["tokens"])
+	}
+}
+
+func TestExtractFileAttachmentsSnapshotByType(t *testing.T) {
+	makeAttachment := func(path, content string) TranscriptEntry {
+		raw, _ := json.Marshal(map[string]interface{}{
+			"file": map[string]string{"path": path, "content": content},
+		})
+		return TranscriptEntry{
+			Type: "attachment",
+			Attachment: &Attachment{
+				Type:    "file",
+				Content: raw,
+			},
+		}
+	}
+	entries := []TranscriptEntry{
+		makeAttachment("/repo/main.go", "package main\nfunc main() {}"),
+		makeAttachment("/repo/util.go", "package main"),
+		makeAttachment("/repo/README.md", "# Hello"),
+		makeAttachment("/repo/Makefile", "build:"),  // no extension → "other"
+	}
+
+	snap := extractFileAttachmentsSnapshot(entries)
+	if snap == nil {
+		t.Fatal("expected non-nil snapshot")
+	}
+
+	summary := snap["summary"].(map[string]interface{})
+	if fc := summary["file_count"].(int); fc != 4 {
+		t.Errorf("file_count = %d, want 4", fc)
+	}
+
+	byType := snap["by_type"].([]map[string]interface{})
+	exts := map[string]map[string]interface{}{}
+	for _, row := range byType {
+		exts[row["ext"].(string)] = row
+	}
+
+	if _, ok := exts[".go"]; !ok {
+		t.Error("expected .go entry")
+	}
+	if _, ok := exts[".md"]; !ok {
+		t.Error("expected .md entry")
+	}
+	if _, ok := exts["other"]; !ok {
+		t.Error("expected other entry for Makefile")
+	}
+	if exts[".go"]["file_count"].(int) != 2 {
+		t.Errorf(".go file_count = %d, want 2", exts[".go"]["file_count"])
+	}
+}
+
 func TestExtractAgentsSnapshotPrefersFrontmatterName(t *testing.T) {
 	home := t.TempDir()
 	cwd := t.TempDir()