Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 20 additions & 15 deletions docs/metadata-schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,19 +254,28 @@ CLAUDE.md / MEMORY.md / `.agents/` files loaded at session start.

## `cc.thinking` schema

Per-turn aggregate of thinking blocks, grouped by model. (`effort` isn't
in the transcript — left out until we add a SessionStart capture.)
Per-turn aggregate of thinking tokens bucketed by effort level. Level is
inferred from actual tokens per LLM call — the transcript does not expose
the requested `budget_tokens`.

| Level | Thinking tokens per call |
|-------|--------------------------|
| `minimal` | ≤ 500 |
| `light` | 501 – 3 000 |
| `medium` | 3 001 – 10 000 |
| `heavy` | > 10 000 |

```jsonc
"cc": {
"thinking": {
"summary": {
"total_tokens": 9230,
"block_count": 18
"call_count": 5
},
"by_model": [
{ "model": "claude-opus-4-7", "tokens": 8100, "block_count": 14 },
{ "model": "claude-haiku-4-5", "tokens": 1130, "block_count": 4 }
"by_level": [
{ "level": "minimal", "tokens": 130, "call_count": 2 },
{ "level": "light", "tokens": 1000, "call_count": 1 },
{ "level": "medium", "tokens": 8100, "call_count": 2 }
]
}
}
Expand Down Expand Up @@ -326,7 +335,7 @@ turn-level lane total.

## `cc.file_attachments` schema

@-mentioned files and system-injected attachments (excluding skill bodies — those go under `cc.skills.loaded`).
@-mentioned files and system-injected attachments (excluding skill bodies — those go under `cc.skills.loaded`), grouped by lower-cased file extension. Files with no extension are counted under `other`.

```jsonc
"cc": {
Expand All @@ -335,14 +344,10 @@ turn-level lane total.
"total_tokens": 12400,
"file_count": 4
},
"files": [
{
"path": "/Users/collinc/code/opik/apps/opik-frontend/src/v2/router.tsx",
"sha256": "1234abcd…",
"body_tokens": 8200,
"content_type": "source" // source | log | image | pdf | csv | other
}
// …
"by_type": [
{ "ext": ".tsx", "tokens": 8200, "file_count": 1 },
{ "ext": ".md", "tokens": 3100, "file_count": 2 },
{ "ext": "other", "tokens": 1100, "file_count": 1 } // no extension
]
}
}
Expand Down
152 changes: 99 additions & 53 deletions src/extractors.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,62 +290,89 @@ func readInstalledPluginPaths(home string) map[string]string {
return out
}

// extractThinkingSnapshot aggregates thinking-block tokens per model,
// driven off the SAME per-block attribution that lands on each span's
// cc.llm_call.attributed_output_tokens. This guarantees that
// Σ attributed[thinking] over the trace == cc.thinking.summary.total_tokens.
// `cc.thinking.{summary, by_model}`.
// extractThinkingSnapshot aggregates thinking-block tokens bucketed by effort
// level. Level is derived from actual thinking tokens per LLM call (the
// transcript does not expose the requested budget_tokens).
//
// `parsed` should be the dedup-applied output of ParseAssistantMessages +
// DeduplicateUsage on the turn's entries. Pass nil to reparse (for
// callers that don't already have a cached slice).
// Buckets: minimal ≤500, light 501–3 000, medium 3 001–10 000, heavy >10 000.
//
// `cc.thinking.{summary, by_level}`.
func extractThinkingSnapshot(entries []TranscriptEntry, parsed []ParsedEntry) map[string]interface{} {
// NOTE(review): this listing is a rendered diff whose +/- markers were lost.
// Statements that appear to belong to the removed per-model implementation
// (byModel, totalBlocks, byModelOut) are interleaved with the added per-level
// implementation (callThinking, byLevel, byLevelOut), so the body does not
// compile as shown — confirm against the real file before relying on it.

// A nil `parsed` means the caller has no cached slice: parse the turn's
// entries and run the usage de-duplication pass here.
if parsed == nil {
parsed = ParseAssistantMessages(entries)
DeduplicateUsage(parsed)
}

type group struct {
tokens, blockCount int
}
byModel := map[string]*group{}
totalTokens, totalBlocks := 0, 0

// Sum thinking tokens per LLM call (MessageID).
callThinking := map[string]int{}
anonTokens := 0
for _, p := range parsed {
// Only thinking blocks contribute to this snapshot.
if p.ContentType != "thinking" {
continue
}
g, ok := byModel[p.Model]
if !ok {
g = &group{}
byModel[p.Model] = g
// Blocks without a MessageID cannot be tied to a specific call; their
// tokens are pooled into a single running total.
if p.MessageID == "" {
anonTokens += p.AttributedOutputTokens
continue
}
g.tokens += p.AttributedOutputTokens
g.blockCount++
totalTokens += p.AttributedOutputTokens
totalBlocks++
callThinking[p.MessageID] += p.AttributedOutputTokens
}
// The pooled total is recorded as one pseudo-call under "__anon", and only
// when it is positive.
if anonTokens > 0 {
callThinking["__anon"] = anonTokens
}
if totalBlocks == 0 {
// No thinking recorded for any call: there is no snapshot to emit.
if len(callThinking) == 0 {
return nil
}

byModelOut := make([]map[string]interface{}, 0, len(byModel))
for m, g := range byModel {
byModelOut = append(byModelOut, map[string]interface{}{
"model": m,
"tokens": g.tokens,
"block_count": g.blockCount,
type levelGroup struct{ calls, tokens int }
byLevel := map[string]*levelGroup{}
totalTokens, totalCalls := 0, 0

// Classify each call's thinking total with thinkingLevel and accumulate the
// call count and token sum per level, alongside the turn-wide totals.
for _, tok := range callThinking {
l := thinkingLevel(tok)
g, ok := byLevel[l]
if !ok {
g = &levelGroup{}
byLevel[l] = g
}
g.calls++
g.tokens += tok
totalTokens += tok
totalCalls++
}

// Rows are emitted in this fixed order; levels with no calls are omitted.
order := []string{"minimal", "light", "medium", "heavy"}
byLevelOut := make([]map[string]interface{}, 0, len(byLevel))
for _, l := range order {
g, ok := byLevel[l]
if !ok {
continue
}
byLevelOut = append(byLevelOut, map[string]interface{}{
"level": l,
"tokens": g.tokens,
"call_count": g.calls,
})
}
sort.Slice(byModelOut, func(i, j int) bool {
return byModelOut[i]["tokens"].(int) > byModelOut[j]["tokens"].(int)
})

return map[string]interface{}{
"summary": map[string]interface{}{
"total_tokens": totalTokens,
"block_count": totalBlocks,
"call_count": totalCalls,
},
"by_model": byModelOut,
"by_level": byLevelOut,
}
}

// thinkingLevel maps the thinking-token total of a single LLM call to its
// effort bucket: "minimal" (≤ 500), "light" (501–3 000), "medium"
// (3 001–10 000) or "heavy" (> 10 000). Zero and negative totals fall into
// "minimal".
func thinkingLevel(tokens int) string {
	if tokens <= 500 {
		return "minimal"
	}
	if tokens <= 3000 {
		return "light"
	}
	if tokens <= 10000 {
		return "medium"
	}
	return "heavy"
}

Expand Down Expand Up @@ -597,21 +624,20 @@ func promptBucket(tokens int) string {
}

// extractFileAttachmentsSnapshot returns @-mentioned + system-injected file
// attachments this turn. Skill bodies are NOT here — they go under
// cc.skills.loaded. `cc.file_attachments.{summary, files}`.
// attachments this turn grouped by file extension. Skill bodies are NOT here —
// they go under cc.skills.loaded. `cc.file_attachments.{summary, by_type}`.
func extractFileAttachmentsSnapshot(entries []TranscriptEntry) map[string]interface{} {
files := []map[string]interface{}{}
total := 0
type group struct{ tokens, count int }
byExt := map[string]*group{}
total, fileCount := 0, 0

for _, e := range entries {
if e.Type != "attachment" || e.Attachment == nil {
continue
}
if e.Attachment.Type != "file" {
continue
}
// File attachment shape: attachment.content is a JSON object with
// a nested file.content string. The struct treats Content as
// RawMessage so we decode lazily here.
var wrapper struct {
File struct {
Path string `json:"path,omitempty"`
Expand All @@ -621,26 +647,46 @@ func extractFileAttachmentsSnapshot(entries []TranscriptEntry) map[string]interf
if err := json.Unmarshal(e.Attachment.Content, &wrapper); err != nil {
continue
}
body := wrapper.File.Content
// Auto-detect — file attachments vary (source code, markdown, JSON, …).
tokens := tokEstimate(body)
files = append(files, map[string]interface{}{
"path": wrapper.File.Path,
"sha256": sha256hex(body),
"body_tokens": tokens,
"content_type": "source", // bucket classification deferred — single bucket for now
})
tokens := tokEstimate(wrapper.File.Content)

ext := strings.ToLower(filepath.Ext(wrapper.File.Path))
if ext == "" {
ext = "other"
}

g, ok := byExt[ext]
if !ok {
g = &group{}
byExt[ext] = g
}
g.tokens += tokens
g.count++
total += tokens
fileCount++
}
if len(files) == 0 {

if fileCount == 0 {
return nil
}

byTypeOut := make([]map[string]interface{}, 0, len(byExt))
for ext, g := range byExt {
byTypeOut = append(byTypeOut, map[string]interface{}{
"ext": ext,
"tokens": g.tokens,
"file_count": g.count,
})
}
sort.Slice(byTypeOut, func(i, j int) bool {
return byTypeOut[i]["tokens"].(int) > byTypeOut[j]["tokens"].(int)
})

return map[string]interface{}{
"summary": map[string]interface{}{
"total_tokens": total,
"file_count": len(files),
"file_count": fileCount,
},
"files": files,
"by_type": byTypeOut,
}
}

Expand Down
116 changes: 116 additions & 0 deletions src/extractors_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,122 @@ func TestExtractOutputTokensSnapshotNilOnEmpty(t *testing.T) {
}
}

// TestExtractThinkingSnapshotByLevel checks that per-call thinking tokens are
// bucketed into effort levels and that the summary totals add up.
func TestExtractThinkingSnapshotByLevel(t *testing.T) {
	// Three LLM calls, each carrying a single thinking block:
	//   msg1 →   200 tokens (minimal)
	//   msg2 →  2000 tokens (light)
	//   msg3 → 15000 tokens (heavy)
	makeEntry := func(msgID string, thinkingTokens int) TranscriptEntry {
		return TranscriptEntry{
			Type: "assistant",
			Message: &Message{
				ID:    msgID,
				Model: "claude-sonnet-4-6",
				Usage: &Usage{OutputTokens: thinkingTokens},
				Content: ContentSlice{
					{Type: "thinking", Thinking: "..."},
				},
			},
		}
	}
	entries := []TranscriptEntry{
		makeEntry("msg1", 200),
		makeEntry("msg2", 2000),
		makeEntry("msg3", 15000),
	}
	parsed := ParseAssistantMessages(entries)
	DeduplicateUsage(parsed)

	snap := extractThinkingSnapshot(entries, parsed)
	if snap == nil {
		t.Fatal("expected non-nil snapshot")
	}

	// Nothing below can be checked without these containers, so a wrong
	// shape is fatal instead of an assertion panic.
	summary, ok := snap["summary"].(map[string]interface{})
	if !ok {
		t.Fatalf("summary = %T, want map[string]interface{}", snap["summary"])
	}
	if total, ok := summary["total_tokens"].(int); !ok || total != 17200 {
		t.Errorf("total_tokens = %v, want 17200", summary["total_tokens"])
	}
	if calls, ok := summary["call_count"].(int); !ok || calls != 3 {
		t.Errorf("call_count = %v, want 3", summary["call_count"])
	}

	byLevel, ok := snap["by_level"].([]map[string]interface{})
	if !ok {
		t.Fatalf("by_level = %T, want []map[string]interface{}", snap["by_level"])
	}
	levels := map[string]map[string]interface{}{}
	for _, row := range byLevel {
		name, ok := row["level"].(string)
		if !ok {
			t.Fatalf("level = %v (%T), want a string", row["level"], row["level"])
		}
		levels[name] = row
	}

	for _, name := range []string{"minimal", "light", "heavy"} {
		if _, ok := levels[name]; !ok {
			t.Errorf("expected %s level", name)
		}
	}
	if _, ok := levels["medium"]; ok {
		t.Error("unexpected medium level")
	}

	// Comma-ok assertions: indexing a missing row yields a nil interface, and
	// an unchecked .(int) on it would panic and mask the failures reported
	// above.
	if got, ok := levels["minimal"]["call_count"].(int); !ok || got != 1 {
		t.Errorf("minimal call_count = %v, want 1", levels["minimal"]["call_count"])
	}
	if got, ok := levels["heavy"]["tokens"].(int); !ok || got != 15000 {
		t.Errorf("heavy tokens = %v, want 15000", levels["heavy"]["tokens"])
	}
}

// TestExtractFileAttachmentsSnapshotByType checks that file attachments are
// grouped by extension and that paths without an extension land in "other".
func TestExtractFileAttachmentsSnapshotByType(t *testing.T) {
	// makeAttachment builds a "file" attachment entry whose content is the
	// JSON wrapper {"file": {"path": ..., "content": ...}}.
	makeAttachment := func(path, content string) TranscriptEntry {
		t.Helper()
		raw, err := json.Marshal(map[string]interface{}{
			"file": map[string]string{"path": path, "content": content},
		})
		if err != nil {
			t.Fatalf("marshal attachment %q: %v", path, err)
		}
		return TranscriptEntry{
			Type: "attachment",
			Attachment: &Attachment{
				Type:    "file",
				Content: raw,
			},
		}
	}
	entries := []TranscriptEntry{
		makeAttachment("/repo/main.go", "package main\nfunc main() {}"),
		makeAttachment("/repo/util.go", "package main"),
		makeAttachment("/repo/README.md", "# Hello"),
		makeAttachment("/repo/Makefile", "build:"), // no extension → "other"
	}

	snap := extractFileAttachmentsSnapshot(entries)
	if snap == nil {
		t.Fatal("expected non-nil snapshot")
	}

	// Nothing below can be checked without these containers, so a wrong
	// shape is fatal instead of an assertion panic.
	summary, ok := snap["summary"].(map[string]interface{})
	if !ok {
		t.Fatalf("summary = %T, want map[string]interface{}", snap["summary"])
	}
	if fc, ok := summary["file_count"].(int); !ok || fc != 4 {
		t.Errorf("file_count = %v, want 4", summary["file_count"])
	}

	byType, ok := snap["by_type"].([]map[string]interface{})
	if !ok {
		t.Fatalf("by_type = %T, want []map[string]interface{}", snap["by_type"])
	}
	exts := map[string]map[string]interface{}{}
	for _, row := range byType {
		ext, ok := row["ext"].(string)
		if !ok {
			t.Fatalf("ext = %v (%T), want a string", row["ext"], row["ext"])
		}
		exts[ext] = row
	}

	if _, ok := exts[".go"]; !ok {
		t.Error("expected .go entry")
	}
	if _, ok := exts[".md"]; !ok {
		t.Error("expected .md entry")
	}
	if _, ok := exts["other"]; !ok {
		t.Error("expected other entry for Makefile")
	}

	// Comma-ok assertion: a missing ".go" row yields a nil interface, and an
	// unchecked .(int) on it would panic and mask the failure reported above.
	if got, ok := exts[".go"]["file_count"].(int); !ok || got != 2 {
		t.Errorf(".go file_count = %v, want 2", exts[".go"]["file_count"])
	}
}

func TestExtractAgentsSnapshotPrefersFrontmatterName(t *testing.T) {
home := t.TempDir()
cwd := t.TempDir()
Expand Down
Loading