Why am I charged for cache creation on every request when my system prompt is already cached with a 1-hour TTL? Ideally, I should only be charged for cache creation on the first request.
/**
 * Entry point: builds one client from the environment and runs three web-search probes
 * in sequence, each with the cache-controlled system block enabled.
 *
 * The first and third prompts are the same string; the second is different. Every run
 * prints the banner and then that request's usage counters, with a blank line between runs.
 */
fun main() {
    val client = AnthropicOkHttpClient.fromEnv()
    val banner = "--- Variant B: cache_control ttl=1h on a large system block, web_search ---"
    val aiInfraPrompt =
        "What are the latest developments in AI infrastructure stocks this week? Search the web and summarise in 5 bullets."
    val prompts =
        listOf(
            aiInfraPrompt,
            "Stocks effected in iran war, Search the web and summarise in 5 bullets.",
            aiInfraPrompt,
        )

    prompts.forEachIndexed { index, prompt ->
        // The blank separator goes between runs only, never after the last one.
        if (index > 0) println()
        println(banner)
        runProbe(client, withCacheControl = true, message = prompt)
    }
}
/**
 * Sends a single Messages request with the web search tool attached and prints the
 * usage counters of the response via [dump].
 *
 * @param client client used to send the request.
 * @param withCacheControl when true, a large system text block carrying an ephemeral
 *   cache_control marker with a 1-hour TTL is attached; when false, the request is sent
 *   without any system prompt.
 * @param message text of the single user message in the request.
 */
private fun runProbe(
    client: AnthropicClient,
    withCacheControl: Boolean,
    message: String,
) {
    // Builds the large system block: a fixed opening sentence followed by the same
    // instruction sentence repeated 500 times, marked for 1-hour ephemeral caching.
    fun cachedSystemBlock(): TextBlockParam {
        val instruction = "Be precise, cite sources, and stay focused on the user's question. "
        val oneHourCache =
            CacheControlEphemeral
                .builder()
                .ttl(CacheControlEphemeral.Ttl.TTL_1H)
                .build()
        return TextBlockParam
            .builder()
            .text("You are an investment research assistant. " + instruction.repeat(500))
            .cacheControl(oneHourCache)
            .build()
    }

    // Web search tool limited to two uses, with DIRECT as the only allowed caller.
    val searchTool =
        WebSearchTool20260209
            .builder()
            .maxUses(2)
            .addAllowedCaller(WebSearchTool20260209.AllowedCaller.DIRECT)
            .build()

    val params =
        MessageCreateParams
            .builder()
            .model(Model.CLAUDE_HAIKU_4_5)
            .maxTokens(1024)
            .addTool(searchTool)
            .addUserMessage(message)
            .apply {
                // The system prompt is only present in the cache-control variant.
                if (withCacheControl) {
                    systemOfTextBlockParams(listOf(cachedSystemBlock()))
                }
            }.build()

    val response = client.messages().create(params)
    dump(response.usage())
}
/**
 * Prints the token and tool-use counters of one response, one `name = value` line each.
 *
 * Optional counters that are absent are reported as 0. The `phantom` line is the total
 * cache-creation token count minus the 5-minute and 1-hour ephemeral counts, i.e. any
 * cache-creation tokens not attributed to either bucket.
 *
 * @param usage usage section of a Messages response.
 */
private fun dump(usage: Usage) {
    val breakdown = usage.cacheCreation()
    val fiveMinuteTokens = breakdown.map { it.ephemeral5mInputTokens() }.orElse(0L)
    val oneHourTokens = breakdown.map { it.ephemeral1hInputTokens() }.orElse(0L)
    val totalCreated = usage.cacheCreationInputTokens().orElse(0L)
    // Whatever part of the total is not accounted for by the two TTL buckets.
    val unattributed = totalCreated - (fiveMinuteTokens + oneHourTokens)
    val searches = usage.serverToolUse().map { it.webSearchRequests() }.orElse(0L)

    listOf(
        "input = ${usage.inputTokens()}",
        "output = ${usage.outputTokens()}",
        "cache_read = ${usage.cacheReadInputTokens().orElse(0L)}",
        "cache_create_total = $totalCreated",
        "ephemeral_5m = $fiveMinuteTokens",
        "ephemeral_1h = $oneHourTokens",
        "phantom = $unattributed",
        "web_search_requests = $searches",
    ).forEach(::println)
}
The first and third prompts are exactly the same, yet the third request is charged for cache-creation tokens again. Please clarify this behaviour. The output is shown below.
input = 354
output = 538
cache_read = 18798
cache_create_total = 11893
ephemeral_5m = 11893
ephemeral_1h = 0
phantom = 0
web_search_requests = 1
--- Variant B: cache_control ttl=1h on a large system block, web_search ---
input = 350
output = 578
cache_read = 18798
cache_create_total = 11504
ephemeral_5m = 11504
ephemeral_1h = 0
phantom = 0
web_search_requests = 1
--- Variant B: cache_control ttl=1h on a large system block, web_search ---
input = 354
output = 520
cache_read = 18798
cache_create_total = 12464
ephemeral_5m = 12464
ephemeral_1h = 0
phantom = 0
web_search_requests = 1
Why am I charged for cache creation on every request when my system prompt is already cached with a 1-hour TTL? Ideally, I should only be charged for cache creation on the first request.
The first and third prompts are exactly the same, yet the third request is charged for cache-creation tokens again. Please clarify this behaviour. The output is shown above.