Skip to content

Charged for cache creation in web search on each turn for same system prompt #355

Description

@nikhil-raj-syfe

Why am I charged for cache creation on every request when my system prompt is already cached with a 1-hour TTL? I would expect to be charged for cache creation only on the first request.

/**
 * Entry point for the reproduction.
 *
 * Builds a client from the environment and sends three probes in sequence, all with
 * cache control enabled. The first and third probes use the same user prompt; the
 * second uses a different one. A banner line is printed before each probe and a blank
 * line separates consecutive probes.
 */
fun main() {
    val client = AnthropicOkHttpClient.fromEnv()

    val banner = "--- Variant B: cache_control ttl=1h on a large system block, web_search ---"
    val aiInfraPrompt =
        "What are the latest developments in AI infrastructure stocks this week? Search the web and summarise in 5 bullets."
    val prompts =
        listOf(
            aiInfraPrompt,
            "Stocks effected in iran war, Search the web and summarise in 5 bullets.",
            aiInfraPrompt,
        )

    prompts.forEachIndexed { index, prompt ->
        // Blank separator goes between probes, not before the first one.
        if (index > 0) {
            println()
        }
        println(banner)
        runProbe(client, withCacheControl = true, prompt)
    }
}

/**
 * Sends one message request with the web search tool attached and prints the
 * usage figures of the response via [dump].
 *
 * @param client client used to issue the request.
 * @param withCacheControl when true, a large system text block carrying a
 *   `cache_control` marker with a 1-hour TTL is added to the request; when false,
 *   no system prompt is set.
 * @param message text of the single user message.
 */
private fun runProbe(
    client: AnthropicClient,
    withCacheControl: Boolean,
    message: String,
) {
    val searchTool =
        WebSearchTool20260209
            .builder()
            .maxUses(2)
            .addAllowedCaller(WebSearchTool20260209.AllowedCaller.DIRECT)
            .build()

    val params =
        MessageCreateParams
            .builder()
            .model(Model.CLAUDE_HAIKU_4_5)
            .maxTokens(1024)
            .addTool(searchTool)
            .addUserMessage(message)
            .apply {
                if (withCacheControl) {
                    // One intro sentence followed by the same instruction sentence 500 times,
                    // which makes the system block large.
                    val systemPrompt =
                        buildString {
                            append("You are an investment research assistant. ")
                            repeat(500) {
                                append("Be precise, cite sources, and stay focused on the user's question. ")
                            }
                        }
                    val oneHourCache =
                        CacheControlEphemeral
                            .builder()
                            .ttl(CacheControlEphemeral.Ttl.TTL_1H)
                            .build()
                    val cachedSystemBlock =
                        TextBlockParam
                            .builder()
                            .text(systemPrompt)
                            .cacheControl(oneHourCache)
                            .build()
                    systemOfTextBlockParams(listOf(cachedSystemBlock))
                }
            }.build()

    val response = client.messages().create(params)
    dump(response.usage())
}

/**
 * Prints the token accounting of a response, one `label = value` line per figure.
 *
 * Missing optional fields are reported as 0. The `phantom` line is the part of the
 * total cache-creation count that is not accounted for by the 5-minute and 1-hour
 * breakdown fields.
 *
 * @param usage usage section of a message response.
 */
private fun dump(usage: Usage) {
    val fiveMinuteTokens = usage.cacheCreation().map { it.ephemeral5mInputTokens() }.orElse(0L)
    val oneHourTokens = usage.cacheCreation().map { it.ephemeral1hInputTokens() }.orElse(0L)
    val totalCreated = usage.cacheCreationInputTokens().orElse(0L)
    val unattributed = totalCreated - fiveMinuteTokens - oneHourTokens
    val searchCount = usage.serverToolUse().map { it.webSearchRequests() }.orElse(0L)

    val report =
        listOf(
            "input              = ${usage.inputTokens()}",
            "output             = ${usage.outputTokens()}",
            "cache_read         = ${usage.cacheReadInputTokens().orElse(0L)}",
            "cache_create_total = $totalCreated",
            "ephemeral_5m       = $fiveMinuteTokens",
            "ephemeral_1h       = $oneHourTokens",
            "phantom            = $unattributed",
            "web_search_requests = $searchCount",
        )
    for (line in report) {
        println(line)
    }
}

The 1st and 3rd prompts are exactly the same, yet I am charged for cache creation tokens again on the 3rd prompt. Please clarify this behaviour. The output is shown below.

input              = 354
output             = 538
cache_read         = 18798
cache_create_total = 11893
ephemeral_5m       = 11893
ephemeral_1h       = 0
phantom            = 0
web_search_requests = 1

--- Variant B: cache_control ttl=1h on a large system block, web_search ---
input              = 350
output             = 578
cache_read         = 18798
cache_create_total = 11504
ephemeral_5m       = 11504
ephemeral_1h       = 0
phantom            = 0
web_search_requests = 1

--- Variant B: cache_control ttl=1h on a large system block, web_search ---
input              = 354
output             = 520
cache_read         = 18798
cache_create_total = 12464
ephemeral_5m       = 12464
ephemeral_1h       = 0
phantom            = 0
web_search_requests = 1

Metadata

Metadata

Assignees

No one assigned

    Labels

    apiquestionFurther information is requested

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions