From 69fed98ffa732b9a151f690e27a88b5d3d3991a2 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 28 Apr 2026 00:09:39 +0800 Subject: [PATCH 01/38] feat(ai-cache): add plugin skeleton with schema definition --- apisix/plugins/ai-cache.lua | 62 ++++++++++ apisix/plugins/ai-cache/schema.lua | 138 +++++++++++++++++++++ conf/config.yaml.example | 1 + t/plugin/ai-cache.t | 192 +++++++++++++++++++++++++++++ 4 files changed, 393 insertions(+) create mode 100644 apisix/plugins/ai-cache.lua create mode 100644 apisix/plugins/ai-cache/schema.lua create mode 100644 t/plugin/ai-cache.t diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua new file mode 100644 index 000000000000..4c2d272b7c49 --- /dev/null +++ b/apisix/plugins/ai-cache.lua @@ -0,0 +1,62 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local core = require("apisix.core") +local schema = require("apisix.plugins.ai-cache.schema") + +local plugin_name = "ai-cache" + +local _M = { + version = 0.1, + priority = 1065, + name = plugin_name, + schema = schema.schema +} + +function _M.check_schema(conf) + local ok, err = core.schema.check(schema.schema, conf) + if not ok then + return false, err + end + + local layers = conf.layers or { "exact", "semantic" } + for _, layer in ipairs(layers) do + if layer == "semantic" and not (conf.semantic and conf.semantic.embedding) then + return false, "semantic layer requires semantic.embedding to be configured" + end + end + + return true +end + + +function _M.access(conf, ctx) + -- Phase 0 stub: will implement L1/L2 cache lookup in Phase 1 +end + + +function _M.body_filter(conf, ctx) + -- Phase 0 stub: will accumulate response chunks in Phase 1 +end + + +function _M.log(conf, ctx) + -- Phase 0 stub: will write to cache on 2xx in Phase 1 +end + + +return _M \ No newline at end of file diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua new file mode 100644 index 000000000000..60f5a4a901b4 --- /dev/null +++ b/apisix/plugins/ai-cache/schema.lua @@ -0,0 +1,138 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local _M = {} + +local embedding_schema = { + type = "object", + properties = { + provider = { + type = "string", + enum = { "openai", "azure_openai" }, + }, + model = { type = "string" }, + endpoint = { type = "string" }, + api_key = { type = "string" }, + }, + required = { "provider", "endpoint", "api_key" }, +} + +local semantic_schema = { + type = "object", + properties = { + similarity_threshold = { + type = "number", + minimum = 0, + maximum = 1, + default = 0.95, + }, + top_k = { + type = "integer", + minimum = 1, + default = 1, + }, + ttl = { + type = "integer", + minimum = 1, + default = 86400, + }, + embedding = embedding_schema, + }, + required = { "embedding" }, +} + +local exact_schema = { + type = "object", + properties = { + ttl = { + type = "integer", + minimum = 1, + default = 3600, + }, + }, +} + +local redis_schema = { + type = "object", + properties = { + host = { type = "string", default = "127.0.0.1" }, + port = { type = "integer", minimum = 1, maximum = 65535, default = 6379 }, + password = { type = "string", default = "" }, + database = { type = "integer", minimum = 0, default = 0 }, + timeout = { type = "integer", minimum = 1, default = 1000 }, + ssl = { type = "boolean", default = false }, + keepalive_timeout = { type = "integer", minimum = 1, default = 60000 }, + keepalive_pool = { type = "integer", minimum = 1, default = 5 }, + }, +} + +local bypass_item_schema = { + type = "object", + properties = { + header = { type = "string" }, + equals = { type = "string" }, + }, + required = { "header", "equals" }, +} + +local headers_schema = { + type = "object", + properties = { + cache_status = { type = "string", default = "X-AI-Cache-Status" }, + cache_similarity = { type = "string", default = "X-AI-Cache-Similarity" }, + cache_age = { type = "string", default = "X-AI-Cache-Age" }, + }, +} + +_M.schema = { + type = "object", + properties = { + layers = { + type = "array", + items = { type = "string", enum = { "exact", "semantic" } }, + uniqueItems = true, + default = { "exact", "semantic" }, + }, + cache_key = { + type = "object", + properties = { + include_consumer = {type = "boolean", default = false }, + include_vars = { + type = "array", + items = { type = "string" }, + default = {}, + }, + }, + }, + exact = exact_schema, + semantic = semantic_schema, + redis = redis_schema, + bypass_on = { + type = "array", + items = bypass_item_schema, + }, + headers = headers_schema, + max_cache_body_size = { + type = "integer", + minimum = 1, + default = 1048576, + }, + }, + encrypt_fields = { "semantic.embedding.api_key", "redis.password" }, +} + +return _M \ No newline at end of file diff --git a/conf/config.yaml.example b/conf/config.yaml.example index ae7155a86b06..901774540d70 100644 --- a/conf/config.yaml.example +++ b/conf/config.yaml.example @@ -514,6 +514,7 @@ plugins: # plugin list (sorted by priority) - ai-prompt-template # priority: 1071 - ai-prompt-decorator # priority: 1070 - ai-prompt-guard # priority: 1072 + - ai-cache # priority: 1065 - ai-rag # priority: 1060 - ai-aws-content-moderation # priority: 1050 - ai-proxy-multi # priority: 1041 diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t new file mode 100644 index 000000000000..fdeee9cd33d9 --- /dev/null +++ b/t/plugin/ai-cache.t @@ -0,0 +1,192 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +BEGIN { + $ENV{TEST_ENABLE_CONTROL_API_V1} = "0"; +} + +use t::APISIX 'no_plan'; + +log_level("info"); +repeat_each(1); +no_long_string(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } +}); + +run_tests(); + +__DATA__ + +=== TEST 1: valid config - exact layer only +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "exact" }, + exact = { ttl = 600 }, + redis = { + host = "127.0.0.1", + port = 6379, + } + }) + + if not ok then + ngx.say("failed") + else + ngx.say("passed") + end + } + } +--- response_body +passed + + + +=== TEST 2: valid config - both layers with semantic embedding +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "exact", "semantic" }, + exact = { ttl = 3600 }, + semantic = { + similarity_threshold = 0.95, + ttl = 86400, + embedding = { + provider = "openai", + endpoint = "https://api.openai.com/v1/embeddings", + api_key = "sk-test", + }, + }, + redis = { host = "127.0.0.1", port = 6379 }, + }) + + if not ok then + ngx.say(err) + else + ngx.say("passed") + end + } + } +--- response_body +passed + + + +=== TEST 3: semantic without embedding config - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "semantic" }, + }) + if not ok then + ngx.say("failed: ", err) + else + ngx.say("passed") + end + } + } +--- response_body +failed: semantic layer requires semantic.embedding to be configured + + + +=== TEST 4: invalid layer value - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "invalid_layer" }, + }) + if not ok then + ngx.say("failed") + else + ngx.say("passed") + end + } + } +--- response_body +failed + + + +=== TEST 5: unsupported embedding provider - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "semantic" }, + semantic = { + embedding = { + provider = "some-unknown-provider", + endpoint = "https://example.com/embeddings", + api_key = "key", + }, + }, + }) + + if not ok then + ngx.say("failed") + else + ngx.say("passed") + end + } + } +--- response_body +failed + + + +=== TEST 6: similarity_threshold out of range - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "semantic" }, + semantic = { + 
similarity_threshold = 1.5, + embedding = { + provider = "openai", + endpoint = "https://api.openai.com/v1/embeddings", + api_key = "sk-test", + }, + }, + }) + + if not ok then + ngx.say("failed") + else + ngx.say("passed") + end + } + } +--- response_body +failed From aea4028dedaadaa730fd112f54f7d4ad1cefd0fe Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 28 Apr 2026 05:19:34 +0800 Subject: [PATCH 02/38] feat(ai-cache): implement L1 exact cache with hash-based Redis lookup --- apisix/plugins/ai-cache.lua | 138 ++++++++++++++++++++- apisix/plugins/ai-cache/exact.lua | 138 +++++++++++++++++++++ apisix/plugins/ai-cache/schema.lua | 19 +-- t/plugin/ai-cache.t | 193 ++++++++++++++++++++++++++++- 4 files changed, 463 insertions(+), 25 deletions(-) create mode 100644 apisix/plugins/ai-cache/exact.lua diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 4c2d272b7c49..071763f27828 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -15,8 +15,13 @@ -- limitations under the License. -- -local core = require("apisix.core") -local schema = require("apisix.plugins.ai-cache.schema") +local core = require("apisix.core") +local schema = require("apisix.plugins.ai-cache.schema") +local exact = require("apisix.plugins.ai-cache.exact") + +local table_concat = table.concat +local ngx_time = ngx.time +local tostring = tostring local plugin_name = "ai-cache" @@ -45,17 +50,140 @@ end function _M.access(conf, ctx) - -- Phase 0 stub: will implement L1/L2 cache lookup in Phase 1 + -- Check bypass_on conditions + if conf.bypass_on then + local req_headers = ngx.req.get_headers() + for _, rule in ipairs(conf.bypass_on) do + if req_headers[rule.header] == rule.equals then + ctx.ai_cache_bypass = true + ctx.ai_cache_status = "BYPASS" + return + end + end + end + + -- Read and parse request body + local body_tab, err = core.request.get_json_request_body_table() + if not body_tab then + core.log.warn("ai-cache: failed to read request body: ", err or "unknown error") + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + return + end + + local messages = body_tab.messages + if not messages then + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + return + end + + -- Compute cache key components + local scope_hash = exact.compute_scope_hash(conf, ctx) + local prompt_hash, err = exact.compute_prompt_hash(messages) + if not prompt_hash then + core.log.warn("ai-cache: failed to compute prompt hash: ", err) + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + return + end + + -- L1 exact lookup + local layers = conf.layers or { "exact", "semantic" } + local exact_enabled = false + for _, l in ipairs(layers) do + if l == "exact" then + exact_enabled = true + break + end + end + + if exact_enabled then + local cached_body, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) + if lookup_err then + core.log.warn("ai-cache: L1 lookup error: ", lookup_err) + elseif cached_body then + core.log.info("ai-cache: L1 hit for key ", prompt_hash) + ctx.ai_cache_status = "HIT-L1" + ctx.ai_cache_written_at = written_at + return core.response.exit(200, cached_body) + end + end + + -- MISS - store context for body_filter and log phases + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + ctx.ai_cache_scope_hash = scope_hash + ctx.ai_cache_prompt_hash = prompt_hash +end + + +function _M.header_filter(conf, ctx) + if not ctx.ai_cache_status then + return + end + + local status_header = (conf.headers and conf.headers.cache_status) + or 
"X-AI-Cache-Status" + ngx.header[status_header] = ctx.ai_cache_status + + if ctx.ai_cache_status == "HIT-L1" and ctx.ai_cache_written_at then + local age_header = (conf.headers and conf.headers.cache_age) + or "X-AI-Cache-Age" + ngx.header[age_header] = tostring(ngx_time() - ctx.ai_cache_written_at) + end end function _M.body_filter(conf, ctx) - -- Phase 0 stub: will accumulate response chunks in Phase 1 + if not ctx.ai_cache_miss then + return + end + + local chunk = ngx.arg[1] + + if type(chunk) == "string" and chunk ~= "" then + if not ctx.ai_cache_body_chunks then + ctx.ai_cache_body_chunks = {} + end + local chunks = ctx.ai_cache_body_chunks + chunks[#chunks + 1] = chunk + end end function _M.log(conf, ctx) - -- Phase 0 stub: will write to cache on 2xx in Phase 1 + if not ctx.ai_cache_miss or ctx.ai_cache_bypass then + return + end + + local status = core.response.get_upstream_status(ctx) or ngx.status + if not status or status < 200 or status >= 300 then + return + end + + if not ctx.ai_cache_body_chunks then + return + end + + local body = table_concat(ctx.ai_cache_body_chunks) + local max_size = conf.max_cache_body_size or 1048576 + if #body > max_size then + core.log.warn("ai-cache: response body exceeds max_cache_body_size, skipping write") + return + end + + local ttl = (conf.exact and conf.exact.ttl) or 3600 + local scope_hash = ctx.ai_cache_scope_hash + local prompt_hash = ctx.ai_cache_prompt_hash + + ngx.timer.at(0, function(premature) + if premature then return end + local err = exact.set(conf, scope_hash, prompt_hash, body, ttl) + if err then + ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) + end + end) end diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua new file mode 100644 index 000000000000..56d9d98f16ca --- /dev/null +++ b/apisix/plugins/ai-cache/exact.lua @@ -0,0 +1,138 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +local core = require("apisix.core") +local redis = require("apisix.utils.redis") +local resty_sha256 = require("resty.sha256") +local to_hex = require("resty.string").to_hex + +local table_concat = table.concat +local table_sort = table.sort +local ngx_time = ngx.time +local tostring = tostring + +local KEY_PREFIX = "ai-cache:l1:" + +local _M = {} + + +local function sha256_hex(s) + local hash = resty_sha256:new() + hash:update(s) + return to_hex(hash:final()) +end + +_M.sha256_hex = sha256_hex + +function _M.compute_scope_hash(conf, ctx) + local cache_key = conf.cache_key + if not cache_key then + return "" + end + + local parts = {} + local n = 0 + + if cache_key.include_consumer then + n = n + 1 + parts[n] = ctx.consumer_name or "" + end + + if cache_key.include_vars then + for _, var_name in ipairs(cache_key.include_vars) do + local key = var_name + if key:sub(1, 1) == "$" then + key = key:sub(2) + end + n = n + 1 + parts[n] = tostring(ctx.var[key] or "") + end + end + + if n == 0 then + return "" + end + + table_sort(parts) + return sha256_hex(table_concat(parts, "|")) +end + + +function _M.compute_prompt_hash(messages) + local encoded, err = core.json.encode(messages) + if not encoded then + return nil, err + end + return sha256_hex(encoded), nil +end + + +function _M.get(conf, scope_hash, prompt_hash) + local red, err = redis.new(conf) + if not red then + return nil, nil, err + end + + local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash + local res, err = red:get(key) + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + + if err then + return nil, nil, err + end + + if res == ngx.null then + return nil, nil, nil + end + + local entry, decode_err = core.json.decode(res) + if not entry then + return nil, nil, "corrupt cache entry: " .. decode_err + end + + return entry.body, entry.written_at, nil +end + + +function _M.set(conf, scope_hash, prompt_hash, body, ttl) + local red, err = redis.new(conf) + if not red then + return err + end + + local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash + local entry, encode_err = core.json.encode({ + body = body, + written_at = ngx_time(), + }) + + if not entry then + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return encode_err + end + + local ok, err = red:set(key, entry, "EX", ttl) + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + + if not ok then + return err + end + return nil +end + + +return _M diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 60f5a4a901b4..76878d4c028b 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -15,6 +15,8 @@ -- limitations under the License. 
-- +local redis_schema = require("apisix.utils.redis-schema") + local _M = {} local embedding_schema = { @@ -66,19 +68,6 @@ local exact_schema = { }, } -local redis_schema = { - type = "object", - properties = { - host = { type = "string", default = "127.0.0.1" }, - port = { type = "integer", minimum = 1, maximum = 65535, default = 6379 }, - password = { type = "string", default = "" }, - database = { type = "integer", minimum = 0, default = 0 }, - timeout = { type = "integer", minimum = 1, default = 1000 }, - ssl = { type = "boolean", default = false }, - keepalive_timeout = { type = "integer", minimum = 1, default = 60000 }, - keepalive_pool = { type = "integer", minimum = 1, default = 5 }, - }, -} local bypass_item_schema = { type = "object", @@ -120,7 +109,6 @@ _M.schema = { }, exact = exact_schema, semantic = semantic_schema, - redis = redis_schema, bypass_on = { type = "array", items = bypass_item_schema, @@ -132,7 +120,8 @@ _M.schema = { default = 1048576, }, }, - encrypt_fields = { "semantic.embedding.api_key", "redis.password" }, + allOf = { redis_schema.schema.redis }, + encrypt_fields = { "semantic.embedding.api_key", "redis_password" }, } return _M \ No newline at end of file diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index fdeee9cd33d9..56fa2ee9e3b0 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -24,6 +24,7 @@ use t::APISIX 'no_plan'; log_level("info"); repeat_each(1); no_long_string(); +no_shuffle(); no_root_location(); add_block_preprocessor(sub { @@ -32,6 +33,36 @@ add_block_preprocessor(sub { if (!defined $block->request) { $block->set_value("request", "GET /t"); } + + if (!$block->error_log && !$block->no_error_log) { + $block->set_value("no_error_log", "[error]\n[alert]"); + } + + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + server_name llm; + listen 1990; + default_type 'application/json'; + + location / { + content_by_lua_block { + ngx.status = 200 + ngx.header["Content-Type"] = "application/json" + ngx.say('{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]}') + } + } + + location /error { + content_by_lua_block { + ngx.status = 400 + ngx.header["Content-Type"] = "application/json" + ngx.say('{"error":{"message":"bad request","type":"invalid_request_error"}}') + } + } +} +_EOC_ + } }); run_tests(); @@ -46,10 +77,8 @@ __DATA__ local ok, err = plugin.check_schema({ layers = { "exact" }, exact = { ttl = 600 }, - redis = { - host = "127.0.0.1", - port = 6379, - } + redis_host = "127.0.0.1", + redis_port = 6379, }) if not ok then @@ -81,7 +110,8 @@ passed api_key = "sk-test", }, }, - redis = { host = "127.0.0.1", port = 6379 }, + redis_host = "127.0.0.1", + redis_port = 6379, }) if not ok then @@ -103,6 +133,7 @@ passed local plugin = require("apisix.plugins.ai-cache") local ok, err = plugin.check_schema({ layers = { "semantic" }, + redis_host = "127.0.0.1", }) if not ok then ngx.say("failed: ", err) @@ -190,3 +221,155 @@ failed } --- response_body failed + + + +=== TEST 7: set up route for L1 cache tests +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/chat", + "plugins": { + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1", + "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}] + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1990": 1 + } + } + }]] + ) + 
+ if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 8: first request - cache MISS, upstream called +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the answer to life?"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body +{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]} + + + +=== TEST 9: second identical request - cache HIT-L1, no upstream call +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the answer to life?"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- response_body +{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]} +--- error_log +ai-cache: L1 hit for key + + + +=== TEST 10: bypass header - BYPASS, upstream called, not cached +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the bypass question?"}]} +--- more_headers +Content-Type: application/json +X-Cache-Bypass: 1 +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 11: same prompt without bypass after bypass - still MISS (bypass did not cache) +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the bypass question?"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 12: set up route for 4xx test +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/2', + ngx.HTTP_PUT, + [[{ + "uri": "/error", + "plugins": { + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1" + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1990": 1 + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 13: 4xx from upstream - not cached +--- request +POST /error +{"messages":[{"role":"user","content":"trigger an error please"}]} +--- more_headers +Content-Type: application/json +--- error_code: 400 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 14: same prompt after 4xx - still MISS (4xx was not cached) +--- request +POST /error +{"messages":[{"role":"user","content":"trigger an error please"}]} +--- more_headers +Content-Type: application/json +--- error_code: 400 +--- response_headers +X-AI-Cache-Status: MISS From f323e04277b150907650e817f9761d3e13ec0cd9 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 28 Apr 2026 06:12:40 +0800 Subject: [PATCH 03/38] feat(ai-cache): add embedding drivers for OpenAI and Azure OpenAI --- apisix/plugins/ai-cache.lua | 3 +- .../ai-cache/embeddings/azure_openai.lua | 68 +++++++ apisix/plugins/ai-cache/embeddings/openai.lua | 71 +++++++ t/plugin/ai-cache.t | 191 ++++++++++++++++++ 4 files changed, 332 insertions(+), 1 deletion(-) create mode 100644 apisix/plugins/ai-cache/embeddings/azure_openai.lua create mode 100644 apisix/plugins/ai-cache/embeddings/openai.lua diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 071763f27828..d4e13ac84146 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -18,7 +18,6 @@ local core = require("apisix.core") local schema = require("apisix.plugins.ai-cache.schema") 
local exact = require("apisix.plugins.ai-cache.exact") - local table_concat = table.concat local ngx_time = ngx.time local tostring = tostring @@ -45,6 +44,8 @@ function _M.check_schema(conf) end end + core.utils.check_https({ "semantic.embedding.endpoint" }, conf, plugin_name) + return true end diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua new file mode 100644 index 000000000000..1ce13ceefeab --- /dev/null +++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua @@ -0,0 +1,68 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local core = require("apisix.core") + +local HTTP_OK = ngx.HTTP_OK +local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR + +local _M = {} + + +function _M.get_embeddings(conf, text, httpc, ssl_verify) + local body, err = core.json.encode({ input = text }) + if not body then + return nil, HTTP_INTERNAL_SERVER_ERROR, err + end + + local res, err = httpc:request_uri(conf.endpoint, { + method = "POST", + headers = { + ["Content-Type"] = "application/json", + ["api-key"] = conf.api_key, + }, + body = body, + ssl_verify = ssl_verify, + }) + + if not res or not res.body then + return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API" + end + + if res.status ~= HTTP_OK then + return nil, res.status, res.body + end + + local res_tab, err = core.json.decode(res.body) + if not res_tab then + return nil, HTTP_INTERNAL_SERVER_ERROR, err + end + + if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then + return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body + end + + local embedding = res_tab.data[1].embedding + if type(embedding) ~= "table" then + return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response" + end + + return embedding, nil, nil +end + + +return _M diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua new file mode 100644 index 000000000000..60f65d6a9777 --- /dev/null +++ b/apisix/plugins/ai-cache/embeddings/openai.lua @@ -0,0 +1,71 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. 
You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+
+local HTTP_OK = ngx.HTTP_OK
+local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR
+
+local _M = {}
+
+
+function _M.get_embeddings(conf, text, httpc, ssl_verify)
+    local body, err = core.json.encode({
+        input = text,
+        model = conf.model or "text-embedding-3-small",
+    })
+    if not body then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, err
+    end
+
+    local res, err = httpc:request_uri(conf.endpoint, {
+        method = "POST",
+        headers = {
+            ["Content-Type"] = "application/json",
+            ["Authorization"] = "Bearer " .. conf.api_key,
+        },
+        body = body,
+        ssl_verify = ssl_verify,
+    })
+
+    if not res or not res.body then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API"
+    end
+
+    if res.status ~= HTTP_OK then
+        return nil, res.status, res.body
+    end
+
+    local res_tab, err = core.json.decode(res.body)
+    if not res_tab then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, err
+    end
+
+    if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body
+    end
+
+    local embedding = res_tab.data[1].embedding
+    if type(embedding) ~= "table" then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response"
+    end
+
+    return embedding, nil, nil
+end
+
+
+return _M
diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t
index 56fa2ee9e3b0..ae3a7e8ba3a3 100644
--- a/t/plugin/ai-cache.t
+++ b/t/plugin/ai-cache.t
@@ -373,3 +373,194 @@ Content-Type: application/json
 --- error_code: 400
 --- response_headers
 X-AI-Cache-Status: MISS
+
+
+
+=== TEST 15: openai driver - parses embedding vector correctly
+--- http_config
+server {
+    listen 1991;
+    default_type 'application/json';
+
+    location /v1/embeddings {
+        content_by_lua_block {
+            local cjson = require("cjson.safe")
+            ngx.req.read_body()
+            local body = cjson.decode(ngx.req.get_body_data())
+
+            if ngx.req.get_headers()["Authorization"] ~= "Bearer test-key" then
+                ngx.status = 401
+                ngx.say('{"error":"unauthorized"}')
+                return
+            end
+
+            ngx.status = 200
+            ngx.say(cjson.encode({
+                data = {
+                    { embedding = {0.1, 0.2, 0.3}, index = 0, object = "embedding" }
+                },
+                model = body.model,
+                object = "list"
+            }))
+        }
+    }
+}
+--- config
+    location /t {
+        content_by_lua_block {
+            local http = require("resty.http")
+            local driver = require("apisix.plugins.ai-cache.embeddings.openai")
+
+            local httpc = http.new()
+            local conf = {
+                endpoint = "http://127.0.0.1:1991/v1/embeddings",
+                api_key = "test-key",
+                model = "text-embedding-3-small",
+            }
+
+            local embedding, status, err = driver.get_embeddings(conf, "hello world", httpc, false)
+            if not embedding then
+                ngx.say("error: ", err)
+                return
+            end
+
+            if #embedding ~= 3 then
+                ngx.say("wrong length: ", #embedding)
+                return
+            end
+
+            ngx.say("ok: ", embedding[1], " ", embedding[2], " ", embedding[3])
+        }
+    }
+--- response_body
+ok: 0.1 0.2 0.3
+
+
+
+=== TEST 16: openai driver - 429 from API returns nil with status
+--- http_config
+server {
+    listen 1991;
+    default_type 'application/json';
+
+    location /v1/embeddings {
+        
content_by_lua_block { + ngx.status = 429 + ngx.say('{"error":{"message":"rate limit exceeded","type":"requests"}}') + } + } +} +--- config + location /t { + content_by_lua_block { + local http = require("resty.http") + local driver = require("apisix.plugins.ai-cache.embeddings.openai") + + local httpc = http.new() + local conf = { + endpoint = "http://127.0.0.1:1991/v1/embeddings", + api_key = "test-key", + } + + local embedding, status, err = driver.get_embeddings(conf, "hello", httpc, false) + if embedding then + ngx.say("unexpected success") + return + end + + ngx.say("status: ", status) + } + } +--- response_body +status: 429 + + + +=== TEST 17: azure_openai driver - parses embedding vector correctly +--- http_config +server { + listen 1991; + default_type 'application/json'; + + location /embeddings { + content_by_lua_block { + local cjson = require("cjson.safe") + + if ngx.req.get_headers()["api-key"] ~= "azure-test-key" then + ngx.status = 401 + ngx.say('{"error":"unauthorized"}') + return + end + + ngx.status = 200 + ngx.say(cjson.encode({ + data = { + { embedding = {0.4, 0.5, 0.6}, index = 0, object = "embedding" } + }, + object = "list" + })) + } + } +} +--- config + location /t { + content_by_lua_block { + local http = require("resty.http") + local driver = require("apisix.plugins.ai-cache.embeddings.azure_openai") + + local httpc = http.new() + local conf = { + endpoint = "http://127.0.0.1:1991/embeddings", + api_key = "azure-test-key", + } + + local embedding, status, err = driver.get_embeddings(conf, "hello world", httpc, false) + if not embedding then + ngx.say("error: ", err) + return + end + + ngx.say("ok: ", embedding[1], " ", embedding[2], " ", embedding[3]) + } + } +--- response_body +ok: 0.4 0.5 0.6 + + + +=== TEST 18: openai driver - 500 from API returns nil with status +--- http_config +server { + listen 1991; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + ngx.status = 500 + ngx.say('{"error":{"message":"internal server error"}}') + } + } +} +--- config + location /t { + content_by_lua_block { + local http = require("resty.http") + local driver = require("apisix.plugins.ai-cache.embeddings.openai") + + local httpc = http.new() + local conf = { + endpoint = "http://127.0.0.1:1991/v1/embeddings", + api_key = "test-key", + } + + local embedding, status, err = driver.get_embeddings(conf, "hello", httpc, false) + if embedding then + ngx.say("unexpected success") + return + end + + ngx.say("status: ", status) + } + } +--- response_body +status: 500 From eafce29c93153ac10560fcb78d51e5458147f94e Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 28 Apr 2026 06:19:56 +0800 Subject: [PATCH 04/38] chore: fix lint errors --- apisix/plugins/ai-cache.lua | 18 +++++++++--------- .../ai-cache/embeddings/azure_openai.lua | 18 +++++++++--------- apisix/plugins/ai-cache/embeddings/openai.lua | 18 +++++++++--------- apisix/plugins/ai-cache/exact.lua | 18 +++++++++--------- apisix/plugins/ai-cache/schema.lua | 18 +++++++++--------- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index d4e13ac84146..9cb4f7b53243 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. 
See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local core = require("apisix.core") diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua index 1ce13ceefeab..a52c5e265497 100644 --- a/apisix/plugins/ai-cache/embeddings/azure_openai.lua +++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local core = require("apisix.core") diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua index 60f65d6a9777..ff50f2bbea27 100644 --- a/apisix/plugins/ai-cache/embeddings/openai.lua +++ b/apisix/plugins/ai-cache/embeddings/openai.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local core = require("apisix.core") diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index 56d9d98f16ca..5b42ebcb413e 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local core = require("apisix.core") diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 76878d4c028b..444858cc9067 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local redis_schema = require("apisix.utils.redis-schema") From 09e46927345a067e1cc6e6776016c9d8d9446932 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Wed, 29 Apr 2026 00:13:42 +0800 Subject: [PATCH 05/38] refactor(ai-cache): use ai-protocols for protocol-agnostic caching --- apisix/plugins/ai-cache.lua | 86 ++++++++++++++----------------- apisix/plugins/ai-cache/exact.lua | 14 ++--- t/plugin/ai-cache.t | 75 ++++++++++++--------------- 3 files changed, 78 insertions(+), 97 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 9cb4f7b53243..5c5e1cb53ff8 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -18,9 +18,10 @@ local core = require("apisix.core") local schema = require("apisix.plugins.ai-cache.schema") local exact = require("apisix.plugins.ai-cache.exact") -local table_concat = table.concat -local ngx_time = ngx.time -local tostring = tostring +local protocols = require("apisix.plugins.ai-protocols") +local ngx_time = ngx.time +local tostring = tostring +local table_concat = table.concat local plugin_name = "ai-cache" @@ -63,7 +64,6 @@ function _M.access(conf, ctx) end end - -- Read and parse request body local body_tab, err = core.request.get_json_request_body_table() if not body_tab then core.log.warn("ai-cache: failed to read request body: ", err or "unknown error") @@ -72,24 +72,32 @@ function _M.access(conf, ctx) return end - local messages = body_tab.messages - if not messages then + local protocol_name = protocols.detect(body_tab, ctx) + if not protocol_name then + core.log.warn("ai-cache: could not detect AI protocol, skipping cache") + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + return + end + + local proto = protocols.get(protocol_name) + local contents = proto.extract_request_content(body_tab) + if not contents or #contents == 0 then ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end - -- Compute cache key components + local prompt_text = table_concat(contents, " ") local scope_hash = exact.compute_scope_hash(conf, ctx) - local prompt_hash, err = exact.compute_prompt_hash(messages) + local prompt_hash, hash_err = exact.compute_prompt_hash(prompt_text) if not prompt_hash then - core.log.warn("ai-cache: failed to compute prompt hash: ", err) + core.log.warn("ai-cache: failed to compute prompt hash: ", hash_err) ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end - -- L1 exact lookup local layers = conf.layers or { "exact", "semantic" } local exact_enabled = false for _, l in ipairs(layers) do @@ -100,22 +108,26 @@ function _M.access(conf, ctx) end if exact_enabled then - local cached_body, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) + local cached_text, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) if lookup_err then core.log.warn("ai-cache: L1 lookup error: ", lookup_err) - elseif cached_body then + elseif cached_text then core.log.info("ai-cache: L1 hit for key ", prompt_hash) - ctx.ai_cache_status = "HIT-L1" + ctx.ai_cache_status = "HIT-L1" ctx.ai_cache_written_at = written_at - return core.response.exit(200, cached_body) + local is_stream = body_tab.stream == true + return core.response.exit(200, proto.build_deny_response({ + stream = is_stream, + text = cached_text, + })) end end - -- MISS - store context for body_filter and log phases - ctx.ai_cache_miss = true - 
ctx.ai_cache_status = "MISS" + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" ctx.ai_cache_scope_hash = scope_hash ctx.ai_cache_prompt_hash = prompt_hash + ctx.ai_cache_prompt_text = prompt_text end @@ -136,23 +148,6 @@ function _M.header_filter(conf, ctx) end -function _M.body_filter(conf, ctx) - if not ctx.ai_cache_miss then - return - end - - local chunk = ngx.arg[1] - - if type(chunk) == "string" and chunk ~= "" then - if not ctx.ai_cache_body_chunks then - ctx.ai_cache_body_chunks = {} - end - local chunks = ctx.ai_cache_body_chunks - chunks[#chunks + 1] = chunk - end -end - - function _M.log(conf, ctx) if not ctx.ai_cache_miss or ctx.ai_cache_bypass then return @@ -163,24 +158,21 @@ function _M.log(conf, ctx) return end - if not ctx.ai_cache_body_chunks then + local response_text = ctx.var.llm_response_text + if not response_text or response_text == "" then return end - local body = table_concat(ctx.ai_cache_body_chunks) - local max_size = conf.max_cache_body_size or 1048576 - if #body > max_size then - core.log.warn("ai-cache: response body exceeds max_cache_body_size, skipping write") - return - end - - local ttl = (conf.exact and conf.exact.ttl) or 3600 - local scope_hash = ctx.ai_cache_scope_hash - local prompt_hash = ctx.ai_cache_prompt_hash + local ttl = (conf.exact and conf.exact.ttl) or 3600 + local scope_hash = ctx.ai_cache_scope_hash + local prompt_hash = ctx.ai_cache_prompt_hash ngx.timer.at(0, function(premature) - if premature then return end - local err = exact.set(conf, scope_hash, prompt_hash, body, ttl) + if premature then + return + end + + local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl) if err then ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) end diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index 5b42ebcb413e..4b4d36b157a6 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -72,12 +72,8 @@ function _M.compute_scope_hash(conf, ctx) end -function _M.compute_prompt_hash(messages) - local encoded, err = core.json.encode(messages) - if not encoded then - return nil, err - end - return sha256_hex(encoded), nil +function _M.compute_prompt_hash(text) + return sha256_hex(text), nil end @@ -104,11 +100,11 @@ function _M.get(conf, scope_hash, prompt_hash) return nil, nil, "corrupt cache entry: " .. decode_err end - return entry.body, entry.written_at, nil + return entry.text, entry.written_at, nil end -function _M.set(conf, scope_hash, prompt_hash, body, ttl) +function _M.set(conf, scope_hash, prompt_hash, text, ttl) local red, err = redis.new(conf) if not red then return err @@ -116,7 +112,7 @@ function _M.set(conf, scope_hash, prompt_hash, body, ttl) local key = KEY_PREFIX .. scope_hash .. ":" .. 
prompt_hash local entry, encode_err = core.json.encode({ - body = body, + text = text, written_at = ngx_time(), }) diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index ae3a7e8ba3a3..a375d50bd9ca 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -38,31 +38,6 @@ add_block_preprocessor(sub { $block->set_value("no_error_log", "[error]\n[alert]"); } - if (!defined $block->http_config) { - $block->set_value("http_config", <<_EOC_); -server { - server_name llm; - listen 1990; - default_type 'application/json'; - - location / { - content_by_lua_block { - ngx.status = 200 - ngx.header["Content-Type"] = "application/json" - ngx.say('{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]}') - } - } - - location /error { - content_by_lua_block { - ngx.status = 400 - ngx.header["Content-Type"] = "application/json" - ngx.say('{"error":{"message":"bad request","type":"invalid_request_error"}}') - } - } -} -_EOC_ - } }); run_tests(); @@ -234,18 +209,23 @@ failed [[{ "uri": "/chat", "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, "ai-cache": { "layers": ["exact"], "exact": { "ttl": 60 }, "redis_host": "127.0.0.1", "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}] } - }, - "upstream": { - "type": "roundrobin", - "nodes": { - "127.0.0.1:1990": 1 - } } }]] ) @@ -267,11 +247,12 @@ POST /chat {"messages":[{"role":"user","content":"What is the answer to life?"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS ---- response_body -{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]} +--- response_body_like eval +qr/content/ @@ -281,11 +262,12 @@ POST /chat {"messages":[{"role":"user","content":"What is the answer to life?"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: HIT-L1 ---- response_body -{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]} +--- response_body_like eval +qr/content/ --- error_log ai-cache: L1 hit for key @@ -297,6 +279,7 @@ POST /chat {"messages":[{"role":"user","content":"What is the bypass question?"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json X-Cache-Bypass: 1 --- error_code: 200 --- response_headers @@ -310,6 +293,7 @@ POST /chat {"messages":[{"role":"user","content":"What is the bypass question?"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS @@ -326,17 +310,22 @@ X-AI-Cache-Status: MISS [[{ "uri": "/error", "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, "ai-cache": { "layers": ["exact"], "exact": { "ttl": 60 }, "redis_host": "127.0.0.1" } - }, - "upstream": { - "type": "roundrobin", - "nodes": { - "127.0.0.1:1990": 1 - } } }]] ) @@ -358,6 +347,8 @@ POST /error {"messages":[{"role":"user","content":"trigger an error please"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-AI-Fixture-Status: 400 --- error_code: 400 --- 
response_headers X-AI-Cache-Status: MISS @@ -370,6 +361,8 @@ POST /error {"messages":[{"role":"user","content":"trigger an error please"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-AI-Fixture-Status: 400 --- error_code: 400 --- response_headers X-AI-Cache-Status: MISS From a899f6ace764b922d54c882d18a6821b300b72eb Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Wed, 29 Apr 2026 09:18:10 +0800 Subject: [PATCH 06/38] feat(ai-cache): implement L2 semantic cache with Redis Stack KNN search --- apisix/plugins/ai-cache.lua | 122 +++++++++++++++--- apisix/plugins/ai-cache/semantic.lua | 173 +++++++++++++++++++++++++ t/plugin/ai-cache.t | 184 +++++++++++++++++++++++++-- 3 files changed, 452 insertions(+), 27 deletions(-) create mode 100644 apisix/plugins/ai-cache/semantic.lua diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 5c5e1cb53ff8..216291692ca9 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -18,7 +18,9 @@ local core = require("apisix.core") local schema = require("apisix.plugins.ai-cache.schema") local exact = require("apisix.plugins.ai-cache.exact") +local semantic = require("apisix.plugins.ai-cache.semantic") local protocols = require("apisix.plugins.ai-protocols") +local http = require("resty.http") local ngx_time = ngx.time local tostring = tostring local table_concat = table.concat @@ -32,15 +34,24 @@ local _M = { schema = schema.schema } + +local function layer_enabled(conf, name) + local layers = conf.layers or { "exact", "semantic" } + for _, l in ipairs(layers) do + if l == name then return true end + end + return false +end + + function _M.check_schema(conf) local ok, err = core.schema.check(schema.schema, conf) if not ok then return false, err end - local layers = conf.layers or { "exact", "semantic" } - for _, layer in ipairs(layers) do - if layer == "semantic" and not (conf.semantic and conf.semantic.embedding) then + if layer_enabled(conf, "semantic") then + if not (conf.semantic and conf.semantic.embedding) then return false, "semantic layer requires semantic.embedding to be configured" end end @@ -98,21 +109,13 @@ function _M.access(conf, ctx) return end - local layers = conf.layers or { "exact", "semantic" } - local exact_enabled = false - for _, l in ipairs(layers) do - if l == "exact" then - exact_enabled = true - break - end - end - - if exact_enabled then + -- L1 exact lookup + if layer_enabled(conf, "exact") then local cached_text, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) if lookup_err then core.log.warn("ai-cache: L1 lookup error: ", lookup_err) elseif cached_text then - core.log.info("ai-cache: L1 hit for key ", prompt_hash) + core.log.info("ai-cache: L1 hit for key: ", prompt_hash) ctx.ai_cache_status = "HIT-L1" ctx.ai_cache_written_at = written_at local is_stream = body_tab.stream == true @@ -123,6 +126,46 @@ function _M.access(conf, ctx) end end + -- L2 semantic lookup + if layer_enabled(conf, "semantic") then + local emb_conf = conf.semantic.embedding + local emb_driver = require("apisix.plugins.ai-cache.embeddings." .. 
emb_conf.provider) + local httpc = http.new() + + local embedding, _, emb_err = emb_driver.get_embeddings(emb_conf, prompt_text, httpc, true) + if not embedding then + core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err) + else + ctx.ai_cache_embedding = embedding + + local threshold = conf.semantic.similarity_threshold or 0.95 + local cached_text, similarity, search_err = semantic.search( + conf, scope_hash, embedding, threshold + ) + + if search_err then + core.log.warn("ai-cache: L2 search error (degrading to MISS): ", search_err) + elseif cached_text then + core.log.info("ai-cache: L2 hit, similarity=", similarity) + + local l1_ttl = (conf.exact and conf.exact.ttl) or 3600 + local l1_err = exact.set(conf, scope_hash, prompt_hash, cached_text, l1_ttl) + + if l1_err then + core.log.warn("ai-cache: L2->L1 backfill failed: ", l1_err) + end + + ctx.ai_cache_status = "HIT-L2" + ctx.ai_cache_similarity = similarity + local is_stream = body_tab.stream == true + return core.response.exit(200, proto.build_deny_response({ + stream = is_stream, + text = cached_text, + })) + end + end + end + ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" ctx.ai_cache_scope_hash = scope_hash @@ -145,6 +188,12 @@ function _M.header_filter(conf, ctx) or "X-AI-Cache-Age" ngx.header[age_header] = tostring(ngx_time() - ctx.ai_cache_written_at) end + + if ctx.ai_cache_status == "HIT-L2" and ctx.ai_cache_similarity then + local sim_header = (conf.headers and conf.headers.cache_similarity) + or "X-AI-Cache-Similarity" + ngx.header[sim_header] = tostring(ctx.ai_cache_similarity) + end end @@ -163,21 +212,56 @@ function _M.log(conf, ctx) return end - local ttl = (conf.exact and conf.exact.ttl) or 3600 + local exact_enabled = layer_enabled(conf, "exact") + local semantic_enabled = layer_enabled(conf, "semantic") + local ttl_exact = (conf.exact and conf.exact.ttl) or 3600 local scope_hash = ctx.ai_cache_scope_hash local prompt_hash = ctx.ai_cache_prompt_hash + local embedding = ctx.ai_cache_embedding + local prompt_text = ctx.ai_cache_prompt_text ngx.timer.at(0, function(premature) if premature then return end - local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl) - if err then - ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) + if exact_enabled then + local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl_exact) + if err then + ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) + end + end + + if semantic_enabled then + local vec = embedding + + if not vec then + local emb_conf = conf.semantic.embedding + local emb_driver = require( + "apisix.plugins.ai-cache.embeddings." .. 
emb_conf.provider + ) + local httpc = http.new() + local emb, _, emb_err = emb_driver.get_embeddings( + emb_conf, prompt_text, httpc, true + ) + if not emb then + ngx.log(ngx.WARN, + "ai-cache: failed to get embedding for L2 store: ", emb_err) + return + end + vec = emb + end + + local ttl_semantic = (conf.semantic and conf.semantic.ttl) or 86400 + local store_err = semantic.store( + conf, scope_hash, vec, response_text, ttl_semantic + ) + if store_err then + ngx.log(ngx.WARN, "ai-cache: failed to write L2 cache: ", store_err) + end end end) end -return _M \ No newline at end of file +return _M diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua new file mode 100644 index 000000000000..38ace604a617 --- /dev/null +++ b/apisix/plugins/ai-cache/semantic.lua @@ -0,0 +1,173 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local apisix_redis = require("apisix.utils.redis") +local uuid = require("resty.jit-uuid") +local ffi = require("ffi") + +local ffi_new = ffi.new +local ffi_string = ffi.string +local ngx_time = ngx.time +local tostring = tostring +local tonumber = tonumber +local type = type + +local INDEX_NAME = "ai-cache-idx" +local KEY_PREFIX = "ai-cache:l2:" + +local _M = {} + +local function pack_vector(vec) + local n = #vec + local buf = ffi_new("float[?]", n) + for i = 0, n - 1 do + buf[i] = vec[i + 1] + end + return ffi_string(buf, n * 4) +end + +local index_ready = false + +local function ensure_index(red, dim) + if index_ready then + return true + end + + local _, err = red["FT.CREATE"](red, + INDEX_NAME, + "ON", "HASH", + "PREFIX", "1", KEY_PREFIX, + "SCHEMA", + "embedding", "VECTOR", "HNSW", "6", + "TYPE", "FLOAT32", + "DIM", tostring(dim), + "DISTANCE_METRIC", "COSINE", + "scope", "TAG", + "created_at", "NUMERIC" + ) + + if err and not err:find("already exists") then + return nil, "FT.CREATE failed: " .. err + end + + index_ready = true + return true +end + + +function _M.search(conf, scope_hash, embedding_vec, threshold) + local red, err = apisix_redis.new(conf) + if not red then + return nil, nil, err + end + + local ok, init_err = ensure_index(red, #embedding_vec) + if not ok then + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return nil, nil, init_err + end + + local binary_vec = pack_vector(embedding_vec) + + local query + if scope_hash == "" then + query = "*=>[KNN 1 @embedding $vec AS dist]" + else + query = "@scope:{" .. scope_hash .. 
"} *=>[KNN 1 @embedding $vec AS dist]" + end + + local res, search_err = red["FT.SEARCH"](red, + INDEX_NAME, + query, + "PARAMS", "2", "vec", binary_vec, + "SORTBY", "dist", "ASC", + "LIMIT", "0", "1", + "RETURN", "2", "response", "dist", + "DIALECT", "2" + ) + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + + if search_err then + return nil, nil, search_err + end + + if not res or res[1] == 0 then + return nil, nil, nil + end + + -- RESP2: {count, key, {field, val, field, val, ...}, ...} + local fields = res[3] + if type(fields) ~= "table" then + return nil, nil, nil + end + + local response_text, dist + for i = 1, #fields, 2 do + if fields[i] == "response" then + response_text = fields[i + 1] + elseif fields[i] == "dist" then + dist = tonumber(fields[i + 1]) + end + end + + if not response_text or not dist then + return nil, nil, nil + end + + local similarity = 1 - dist + if similarity < threshold then + return nil, nil, nil + end + + return response_text, similarity, nil +end + + +function _M.store(conf, scope_hash, embedding_vec, text, ttl) + local red, err = apisix_redis.new(conf) + if not red then + return err + end + + local ok, init_err = ensure_index(red, #embedding_vec) + if not ok then + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return init_err + end + + local binary_vec = pack_vector(embedding_vec) + local key = KEY_PREFIX .. uuid.generate_v4() + + local set_ok, set_err = red:hset(key, + "embedding", binary_vec, + "response", text, + "scope", scope_hash, + "created_at", tostring(ngx_time()) + ) + + if not set_ok then + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return set_err + end + + red:expire(key, ttl) + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return nil +end + + +return _M diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index a375d50bd9ca..f25379413c40 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -38,6 +38,29 @@ add_block_preprocessor(sub { $block->set_value("no_error_log", "[error]\n[alert]"); } + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + listen 1990; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + local fixture_loader = require("lib.fixture_loader") + local content, err = fixture_loader.load("openai/embeddings-list.json") + if not content then + ngx.status = 500 + ngx.say(err) + return + end + + ngx.status = 200 + ngx.print(content) + } + } +} +_EOC_ + } }); run_tests(); @@ -372,7 +395,7 @@ X-AI-Cache-Status: MISS === TEST 15: openai driver - parses embedding vector correctly --- http_config server { - listen 1991; + listen 1990; default_type 'application/json'; location /v1/embeddings { @@ -406,7 +429,7 @@ server { local httpc = http.new() local conf = { - endpoint = "http://127.0.0.1:1991/v1/embeddings", + endpoint = "http://127.0.0.1:1990/v1/embeddings", api_key = "test-key", model = "text-embedding-3-small", } @@ -433,7 +456,7 @@ ok: 0.1 0.2 0.3 === TEST 16: openai driver - 429 from API return nil with status --- http_config server { - listen 1991; + listen 1990; default_type 'application/json'; location /v1/embeddings { @@ -451,7 +474,7 @@ server { local httpc = http.new() local conf = { - endpoint = "http://127.0.0.1:1991/v1/embeddings", + endpoint = "http://127.0.0.1:1990/v1/embeddings", api_key = "test-key", } @@ -472,7 +495,7 @@ status: 429 === TEST 17: azure_openai driver - parses embedding vector correctly --- 
http_config server { - listen 1991; + listen 1990; default_type 'application/json'; location /embeddings { @@ -503,7 +526,7 @@ server { local httpc = http.new() local conf = { - endpoint = "http://127.0.0.1:1991/embeddings", + endpoint = "http://127.0.0.1:1990/embeddings", api_key = "azure-test-key", } @@ -524,7 +547,7 @@ ok: 0.4 0.5 0.6 === TEST 18: openai driver - 500 from API returns nil with status --- http_config server { - listen 1991; + listen 1990; default_type 'application/json'; location /v1/embeddings { @@ -542,7 +565,7 @@ server { local httpc = http.new() local conf = { - endpoint = "http://127.0.0.1:1991/v1/embeddings", + endpoint = "http://127.0.0.1:1990/v1/embeddings", api_key = "test-key", } @@ -557,3 +580,148 @@ server { } --- response_body status: 500 + + + +=== TEST 19: clean up L2 state before semantic tests +--- config + location /t { + content_by_lua_block { + local redis = require("resty.redis") + local red = redis:new() + red:set_timeout(1000) + assert(red:connect("127.0.0.1", 6379)) + + red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") + + local keys = red:keys("ai-cache:*") + if type(keys) == "table" and #keys > 0 then + red:del(unpack(keys)) + end + + red:close() + ngx.say("ok") + } + } +--- response_body +ok + + + +=== TEST 20: set up route for L2 semantic cache tests +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/3', + ngx.HTTP_PUT, + [[{ + "uri": "/semantic", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { + "ttl": 60 + }, + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings", + "api_key": "test-key" + } + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 21: L2 - first request, cache MISS, stored in L2 +--- request +POST /semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 22: L2 - different wording hits L2 (same vector from fixture) +--- request +POST /semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 +--- response_body_like eval +qr/content/ +--- error_log +ai-cache: L2 hit + + + +=== TEST 23: L2 - original prompt now hits L1 (backfilled by the L2 hit) +--- request +POST /semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- error_log +ai-cache: L1 hit for key + + + +=== TEST 24: L2 degradation - search error results in MISS, not 500 +--- config + location /t { + content_by_lua_block { + local semantic = require("apisix.plugins.ai-cache.semantic") + local conf = { + redis_host = "127.0.0.1", + redis_port = 6379, + redis_timeout = 100, + } + + local text, 
sim, err = semantic.search(conf, "", {0.1, 0.2, 0.3}, 0.95) + if err then + ngx.say("degraded gracefully") + else + ngx.say("miss, no error") + end + } + } +--- response_body_like eval +qr/degraded gracefully|miss, no error/ From 8d64a394a4afd5fd5f264b71b86d08bbc72656b3 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 01:32:18 +0800 Subject: [PATCH 07/38] feat(ai-cache): set Content-Type text/event-stream on streaming cache hits --- apisix/plugins/ai-cache.lua | 10 +++++++-- t/plugin/ai-cache.t | 44 +++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 216291692ca9..d0b8e84a9298 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -109,6 +109,8 @@ function _M.access(conf, ctx) return end + local is_stream = body_tab.stream == true + -- L1 exact lookup if layer_enabled(conf, "exact") then local cached_text, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) @@ -118,7 +120,9 @@ function _M.access(conf, ctx) core.log.info("ai-cache: L1 hit for key: ", prompt_hash) ctx.ai_cache_status = "HIT-L1" ctx.ai_cache_written_at = written_at - local is_stream = body_tab.stream == true + if is_stream then + core.response.set_header("Content-Type", "text/event-stream") + end return core.response.exit(200, proto.build_deny_response({ stream = is_stream, text = cached_text, @@ -157,7 +161,9 @@ function _M.access(conf, ctx) ctx.ai_cache_status = "HIT-L2" ctx.ai_cache_similarity = similarity - local is_stream = body_tab.stream == true + if is_stream then + core.response.set_header("Content-Type", "text/event-stream") + end return core.response.exit(200, proto.build_deny_response({ stream = is_stream, text = cached_text, diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index f25379413c40..11aed3bc2da3 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -725,3 +725,47 @@ ai-cache: L1 hit for key } --- response_body_like eval qr/degraded gracefully|miss, no error/ + + + +=== TEST 25: streaming MISS - upstream called, response cached via log phase +--- request +POST /chat +{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-streaming.sse +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 26: streaming HIT - Content-Type is text/event-stream, SSE body returned +--- request +POST /chat +{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +Content-Type: text/event-stream +--- response_body_like eval +qr/data:.*content/ +--- wait: 1 + + + +=== TEST 27: non-streaming HIT after streaming MISS - returns JSON +--- request +POST /chat +{"messages":[{"role":"user","content":"Stream me something cool"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +Content-Type: application/json +--- response_body_like eval +qr/content/ From 6f72c377dc55728e095d79e733901d3557978f0d Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 04:32:12 +0800 Subject: [PATCH 08/38] feat(ai-cache): add Prometheus metrics for hits, misses, and embedding latency --- apisix/plugins/ai-cache.lua | 19 +- apisix/plugins/prometheus/exporter.lua | 53 +++++ t/plugin/prometheus-ai-cache.t | 305 
+++++++++++++++++++++++++ 3 files changed, 369 insertions(+), 8 deletions(-) create mode 100644 t/plugin/prometheus-ai-cache.t diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index d0b8e84a9298..94de0043c123 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -22,6 +22,7 @@ local semantic = require("apisix.plugins.ai-cache.semantic") local protocols = require("apisix.plugins.ai-protocols") local http = require("resty.http") local ngx_time = ngx.time +local ngx_now = ngx.now local tostring = tostring local table_concat = table.concat @@ -78,7 +79,6 @@ function _M.access(conf, ctx) local body_tab, err = core.request.get_json_request_body_table() if not body_tab then core.log.warn("ai-cache: failed to read request body: ", err or "unknown error") - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end @@ -86,7 +86,6 @@ function _M.access(conf, ctx) local protocol_name = protocols.detect(body_tab, ctx) if not protocol_name then core.log.warn("ai-cache: could not detect AI protocol, skipping cache") - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end @@ -94,7 +93,6 @@ function _M.access(conf, ctx) local proto = protocols.get(protocol_name) local contents = proto.extract_request_content(body_tab) if not contents or #contents == 0 then - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end @@ -104,7 +102,6 @@ function _M.access(conf, ctx) local prompt_hash, hash_err = exact.compute_prompt_hash(prompt_text) if not prompt_hash then core.log.warn("ai-cache: failed to compute prompt hash: ", hash_err) - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end @@ -122,6 +119,8 @@ function _M.access(conf, ctx) ctx.ai_cache_written_at = written_at if is_stream then core.response.set_header("Content-Type", "text/event-stream") + else + core.response.set_header("Content-Type", "application/json") end return core.response.exit(200, proto.build_deny_response({ stream = is_stream, @@ -136,10 +135,13 @@ function _M.access(conf, ctx) local emb_driver = require("apisix.plugins.ai-cache.embeddings." .. 
emb_conf.provider) local httpc = http.new() + local t0 = ngx_now() local embedding, _, emb_err = emb_driver.get_embeddings(emb_conf, prompt_text, httpc, true) if not embedding then core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err) else + ctx.ai_cache_embedding_latency_ms = (ngx_now() - t0) * 1000 + ctx.ai_cache_embedding_provider = emb_conf.provider ctx.ai_cache_embedding = embedding local threshold = conf.semantic.similarity_threshold or 0.95 @@ -163,6 +165,8 @@ function _M.access(conf, ctx) ctx.ai_cache_similarity = similarity if is_stream then core.response.set_header("Content-Type", "text/event-stream") + else + core.response.set_header("Content-Type", "application/json") end return core.response.exit(200, proto.build_deny_response({ stream = is_stream, @@ -172,7 +176,6 @@ function _M.access(conf, ctx) end end - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" ctx.ai_cache_scope_hash = scope_hash ctx.ai_cache_prompt_hash = prompt_hash @@ -204,12 +207,12 @@ end function _M.log(conf, ctx) - if not ctx.ai_cache_miss or ctx.ai_cache_bypass then + if ctx.ai_cache_status ~= "MISS" then return end - local status = core.response.get_upstream_status(ctx) or ngx.status - if not status or status < 200 or status >= 300 then + local upstream_status = core.response.get_upstream_status(ctx) or ngx.status + if not upstream_status or upstream_status < 200 or upstream_status >= 300 then return end diff --git a/apisix/plugins/prometheus/exporter.lua b/apisix/plugins/prometheus/exporter.lua index ce89ca03302a..cf008400b0f0 100644 --- a/apisix/plugins/prometheus/exporter.lua +++ b/apisix/plugins/prometheus/exporter.lua @@ -160,6 +160,12 @@ function _M.http_init(prometheus_enabled_in_stream) "llm_completion_tokens", "expire") local llm_active_connections_exptime = core.table.try_read_attr(attr, "metrics", "llm_active_connections", "expire") + local ai_cache_hits_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_hits", "expire") + local ai_cache_misses_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_misses", "expire") + local ai_cache_embedding_latency_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_embedding_latency", "expire") prometheus = base_prometheus.init("prometheus-metrics", metric_prefix) @@ -260,6 +266,29 @@ function _M.http_init(prometheus_enabled_in_stream) unpack(extra_labels("llm_active_connections"))}, llm_active_connections_exptime) + metrics.ai_cache_hits = prometheus:counter("ai_cache_hits_total", + "AI cache hit count by layer", + {"route_id", "service_id", "consumer", "layer", + unpack(extra_labels("ai_cache_hits"))}, + ai_cache_hits_exptime) + + metrics.ai_cache_misses = prometheus:counter("ai_cache_misses_total", + "AI cache miss count", + {"route_id", "service_id", "consumer", + unpack(extra_labels("ai_cache_misses"))}, + ai_cache_misses_exptime) + + local ai_cache_embedding_latency_buckets = DEFAULT_BUCKETS + if attr and attr.ai_cache_embedding_latency_buckets then + ai_cache_embedding_latency_buckets = attr.ai_cache_embedding_latency_buckets + end + metrics.ai_cache_embedding_latency = prometheus:histogram("ai_cache_embedding_latency", + "AI cache embedding API call latency in milliseconds", + {"route_id", "service_id", "consumer", "provider", + unpack(extra_labels("ai_cache_embedding_latency"))}, + ai_cache_embedding_latency_buckets, + ai_cache_embedding_latency_exptime) + if prometheus_enabled_in_stream then init_stream_metrics() end @@ -377,6 +406,29 @@ function _M.http_log(conf, ctx) 
vars.request_type, vars.request_llm_model, vars.llm_model, unpack(extra_labels("llm_completion_tokens", ctx)))) end + + if ctx.ai_cache_status then + if ctx.ai_cache_status == "HIT-L1" then + metrics.ai_cache_hits:inc(1, + gen_arr(route_id, service_id, consumer_name, "l1", + unpack(extra_labels("ai_cache_hits", ctx)))) + elseif ctx.ai_cache_status == "HIT-L2" then + metrics.ai_cache_hits:inc(1, + gen_arr(route_id, service_id, consumer_name, "l2", + unpack(extra_labels("ai_cache_hits", ctx)))) + elseif ctx.ai_cache_status == "MISS" then + metrics.ai_cache_misses:inc(1, + gen_arr(route_id, service_id, consumer_name, + unpack(extra_labels("ai_cache_misses", ctx)))) + end + + if ctx.ai_cache_embedding_latency_ms then + metrics.ai_cache_embedding_latency:observe(ctx.ai_cache_embedding_latency_ms, + gen_arr(route_id, service_id, consumer_name, + ctx.ai_cache_embedding_provider or "", + unpack(extra_labels("ai_cache_embedding_latency", ctx)))) + end + end end @@ -790,6 +842,7 @@ function _M.dec_llm_active_connections(ctx) inc_llm_active_connections(ctx, -1) end + function _M.get_prometheus() return prometheus end diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t new file mode 100644 index 000000000000..fc4eb02264bc --- /dev/null +++ b/t/plugin/prometheus-ai-cache.t @@ -0,0 +1,305 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +BEGIN { + $ENV{TEST_ENABLE_CONTROL_API_V1} = "0"; + + if ($ENV{TEST_NGINX_CHECK_LEAK}) { + $SkipReason = "unavailable for the hup tests"; + } else { + $ENV{TEST_NGINX_USE_HUP} = 1; + undef $ENV{TEST_NGINX_USE_STAP}; + } +} + +use t::APISIX 'no_plan'; + +repeat_each(1); +no_long_string(); +no_shuffle(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + my $user_yaml_config = <<_EOC_; +plugin_attr: + prometheus: + refresh_interval: 0.1 +plugins: + - ai-proxy + - ai-cache + - prometheus + - public-api +_EOC_ + $block->set_value("extra_yaml_config", $user_yaml_config); + + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + listen 1990; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + local fixture_loader = require("lib.fixture_loader") + local content, err = fixture_loader.load("openai/embeddings-list.json") + if not content then + ngx.status = 500 + ngx.say(err) + return + end + + ngx.status = 200 + ngx.print(content) + } + } +} +_EOC_ + } +}); + +run_tests; + +__DATA__ + +=== TEST 1: set up routes +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + local routes = { + { + url = "/apisix/admin/routes/1", + data = [[{ + "uri": "/chat", + "plugins": { + "prometheus": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1", + "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}] + } + } + }]], + }, + { + url = "/apisix/admin/routes/2", + data = [[{ + "uri": "/semantic", + "plugins": { + "prometheus": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { "ttl": 60 }, + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings", + "api_key": "test-key" + } + }, + "redis_host": "127.0.0.1" + } + } + }]], + }, + { + url = "/apisix/admin/routes/metrics", + data = [[{ + "plugins": { + "public-api": {} + }, + "uri": "/apisix/prometheus/metrics" + }]], + }, + } + + for _, route in ipairs(routes) do + local code, body = t(route.url, ngx.HTTP_PUT, route.data) + if code >= 300 then + ngx.status = code + end + ngx.say(body) + end + } + } +--- response_body eval +"passed\n" x 3 + + + +=== TEST 2: MISS request - upstream called +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the meaning of life?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 3: same request - HIT-L1 +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the meaning of life?"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- wait: 1 + + + +=== TEST 4: verify miss counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1/ + 
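+# The assertions in TEST 4 and TEST 5 read the Prometheus text exposition
+# directly. The "layer" label distinguishes exact-layer hits (layer="l1")
+# from semantic-layer hits (layer="l2"), so a per-layer hit ratio can be
+# derived by dividing each apisix_ai_cache_hits_total series by the sum of
+# hits and apisix_ai_cache_misses_total for the same route.
+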
+ + +=== TEST 5: verify hit counter with layer label +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1/ + + + +=== TEST 6: BYPASS request - upstream called, no cache interaction +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the meaning of life?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Cache-Bypass: 1 +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 7: verify BYPASS did not increment misses counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1\n/ + + + +=== TEST 8: cleanup Redis L2 state before semantic tests +--- config + location /t { + content_by_lua_block { + local redis = require("resty.redis") + local red = redis:new() + red:set_timeout(1000) + assert(red:connect("127.0.0.1", 6379)) + + red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") + + local keys = red:keys("ai-cache:*") + if type(keys) == "table" and #keys > 0 then + red:del(unpack(keys)) + end + + red:close() + ngx.say("ok") + } + } +--- response_body +ok + + + +=== TEST 9: L2 first request - MISS, embedding API called +--- request +POST /semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 10: L2 second request - different wording, HIT-L2 +--- request +POST /semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 +--- wait: 1 + + + +=== TEST 11: verify hits counter with layer="l2" +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="2",service_id="",consumer="",layer="l2"\} 1/ + + + +=== TEST 12: verify embedding latency histogram with provider label +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_embedding_latency_count\{route_id="2",service_id="",consumer="",provider="openai"\} 2/ From 4cf5c1c0bbeaa426ce1e3014bd7ca3ece2939e3c Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 06:35:10 +0800 Subject: [PATCH 09/38] docs(ai-cache): add plugin documentation --- apisix/plugins/ai-cache.lua | 10 +- docs/en/latest/config.json | 1 + docs/en/latest/plugins/ai-cache.md | 656 +++++++++++++++++++++++++++++ 3 files changed, 666 insertions(+), 1 deletion(-) create mode 100644 docs/en/latest/plugins/ai-cache.md diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 94de0043c123..7caf6cf548ee 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -221,6 +221,14 @@ function _M.log(conf, ctx) return end + local max_size = conf.max_cache_body_size or 1048576 + if #response_text > max_size then + core.log.warn("ai-cache: response size ", #response_text, + " exceeds max_cache_body_size ", max_size, + ", skipping cache write") + return + end + local exact_enabled = layer_enabled(conf, "exact") local semantic_enabled = layer_enabled(conf, "semantic") local ttl_exact = (conf.exact and conf.exact.ttl) or 3600 @@ -233,7 +241,7 @@ function _M.log(conf, ctx) if 
premature then return end - + if exact_enabled then local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl_exact) if err then diff --git a/docs/en/latest/config.json b/docs/en/latest/config.json index d24eacc3f8e9..c198826c7505 100644 --- a/docs/en/latest/config.json +++ b/docs/en/latest/config.json @@ -75,6 +75,7 @@ "plugins/ai-proxy-multi", "plugins/ai-rate-limiting", "plugins/ai-prompt-guard", + "plugins/ai-cache", "plugins/ai-aws-content-moderation", "plugins/ai-aliyun-content-moderation", "plugins/ai-prompt-decorator", diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md new file mode 100644 index 000000000000..71de88a32261 --- /dev/null +++ b/docs/en/latest/plugins/ai-cache.md @@ -0,0 +1,656 @@ +--- +title: ai-cache +keywords: + - Apache APISIX + - API Gateway + - Plugin + - ai-cache +description: The ai-cache Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache, reducing latency and upstream cost. +--- + + + + + + + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Description + +The `ai-cache` Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache instead of incurring another upstream call. It supports two cache layers: an exact-match layer (`exact`) keyed by a hash of the prompt, and a semantic layer (`semantic`) that compares prompt embeddings via Redis Stack vector search. Either layer can be enabled independently, and a hit on the semantic layer backfills the exact layer so subsequent identical prompts return immediately. + +The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-multi](./ai-proxy-multi.md) on the same Route. The semantic layer requires Redis Stack with the RediSearch module and an embedding provider (OpenAI or Azure OpenAI). + +## Plugin Attributes + +| Name | Type | Required | Default | Valid values | Description | +| --- | --- | --- | --- | --- | --- | +| `layers` | array[string] | False | `["exact", "semantic"]` | `"exact"`, `"semantic"` | Cache layers to enable, queried in order. | +| `exact.ttl` | integer | False | `3600` | ≥ 1 | Time-to-live in seconds for exact-layer entries. | +| `semantic.similarity_threshold` | number | False | `0.95` | 0–1 | Minimum cosine similarity required for a semantic-layer hit. | +| `semantic.ttl` | integer | False | `86400` | ≥ 1 | Time-to-live in seconds for semantic-layer entries. | +| `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. | +| `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | HTTPS URL of the embedding API. | +| `semantic.embedding.api_key` | string | True (if semantic enabled) | | | API key for the embedding provider. Stored encrypted. | +| `semantic.embedding.model` | string | False | | | Embedding model name. Uses provider default if omitted. | +| `cache_key.include_consumer` | boolean | False | `false` | | If `true`, partition the cache by consumer name. | +| `cache_key.include_vars` | array[string] | False | `[]` | | Additional `ctx.var` names included in the cache key, for example `["$http_x_tenant_id"]`. | +| `bypass_on` | array[object] | False | | | List of `{header, equals}` rules. If any matches, the request bypasses the cache. | +| `max_cache_body_size` | integer | False | `1048576` | ≥ 1 | Maximum response size in bytes to write to cache. Larger responses pass through but are not cached. 
| +| `headers.cache_status` | string | False | `"X-AI-Cache-Status"` | | Response header for cache status (`HIT-L1`, `HIT-L2`, `MISS`, `BYPASS`). | +| `headers.cache_age` | string | False | `"X-AI-Cache-Age"` | | Response header for the age in seconds of an exact-layer hit. | +| `headers.cache_similarity` | string | False | `"X-AI-Cache-Similarity"` | | Response header for the similarity score of a semantic-layer hit. | + +Redis connection fields (`redis_host`, `redis_port`, `redis_password`, `redis_database`, `redis_timeout`, `redis_ssl`, `redis_ssl_verify`, `redis_username`, `redis_keepalive_timeout`, `redis_keepalive_pool`) follow the shared Redis schema. At minimum, `redis_host` is required. + +## Examples + +The following examples use OpenAI as the Upstream service provider. Before proceeding, create an [OpenAI account](https://openai.com) and an [API key](https://openai.com/blog/openai-api). You can optionally save the key to an environment variable: + +```shell +export OPENAI_API_KEY= +``` + +If you are working with other LLM providers, please refer to the provider's documentation to obtain an API key. + +:::note + +You can fetch the `admin_key` from `config.yaml` and save to an environment variable with the following command: + +```shell +admin_key=$(yq '.deployment.admin.admin_key[0].key' conf/config.yaml | sed 's/"//g') +``` + +::: + +### Cache Identical Prompts with the Exact Layer + +The following example demonstrates how to use the `ai-cache` Plugin with the exact layer only, so that identical prompts are returned from cache. + + + + +Create a Route that uses [ai-proxy](./ai-proxy.md) to proxy to OpenAI and `ai-cache` to cache exact-match prompts: + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 3600 }, + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such: + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + exact: + ttl: 3600 + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + +Send a request to the Route: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" 
} + ] + }' +``` + +The first request reaches OpenAI and you should receive a response similar to the following: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-Da7Iqsqz9gc8Mkf07Hn4NCzAH5Ri1", + "object": "chat.completion", + "created": 1777500252, + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital of France is Paris.", + "refusal": null + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 14, + "completion_tokens": 7, + "total_tokens": 21 + }, + "system_fingerprint": "fp_d3214ccada" +} +``` + +Send the same request again: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" } + ] + }' +``` + +The second request returns from cache without contacting OpenAI. The cached response is replayed from Redis, so the body is shorter and does not contain the original `created`, `model`, `usage`, or `system_fingerprint` fields: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: HIT-L1 +X-AI-Cache-Age: 4 + +{ + "id": "f558665e-3a03-42e3-9aa9-f54c402927c0", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "content": "The capital of France is Paris.", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` + +### Cache Paraphrased Prompts with the Semantic Layer + +The following example demonstrates how to enable the semantic layer so that prompts with different wording but similar meaning are served from cache. + + + + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { "ttl": 3600 }, + "semantic": { + "similarity_threshold": 0.92, + "ttl": 86400, + "embedding": { + "provider": "openai", + "endpoint": "https://api.openai.com/v1/embeddings", + "api_key": "'"$OPENAI_API_KEY"'", + "model": "text-embedding-3-small" + } + }, + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + - semantic + exact: + ttl: 3600 + semantic: + similarity_threshold: 0.92 + ttl: 86400 + embedding: + provider: openai + endpoint: https://api.openai.com/v1/embeddings + api_key: "${OPENAI_API_KEY}" + model: text-embedding-3-small + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + +Send a first request: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" 
} + ] + }' +``` + +The first request reaches OpenAI: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-Da7Iqsqz9gc8Mkf07Hn4NCzAH5Ri1", + "object": "chat.completion", + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital of France is Paris." + }, + "finish_reason": "stop" + } + ], + "usage": { "prompt_tokens": 14, "completion_tokens": 7, "total_tokens": 21 } +} +``` + +Wait a couple of seconds for the semantic-layer write to complete in the background, then send a second request with paraphrased wording: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "capital of France what is?" } + ] + }' +``` + +The semantic layer matches the embedding (cosine similarity above the threshold) and returns the cached response without contacting OpenAI: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: HIT-L2 +X-AI-Cache-Similarity: 0.9720680713654 + +{ + "id": "40b612a5-1424-4096-b7ec-8537a1ee6fd3", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "content": "The capital of France is Paris.", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` + +A semantic-layer hit also backfills the exact layer, so an immediate retry of the same paraphrase returns `X-AI-Cache-Status: HIT-L1`. + +### Isolate Cache Entries Per Consumer or Tenant + +The following example demonstrates how to namespace cache entries so that one consumer's response is not served to another. Use `cache_key.include_consumer` to partition by consumer name, or `cache_key.include_vars` to include request variables such as a tenant header. + + + + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 3600 }, + "cache_key": { + "include_consumer": true, + "include_vars": ["$http_x_tenant_id"] + }, + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + exact: + ttl: 3600 + cache_key: + include_consumer: true + include_vars: + - "$http_x_tenant_id" + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + +Two requests with the same prompt but different `X-Tenant-Id` headers each receive `X-AI-Cache-Status: MISS`, because the cache key now includes the tenant identifier. + +### Bypass the Cache on a Header + +The following example demonstrates how to skip the cache entirely when a request carries a specific header, for example to refresh a cached response or to support staff debugging. 
+ + + + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 3600 }, + "bypass_on": [ + { "header": "X-Cache-Bypass", "equals": "1" } + ], + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + exact: + ttl: 3600 + bypass_on: + - header: X-Cache-Bypass + equals: "1" + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + +Send a request with the bypass header: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Cache-Bypass: 1" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" } + ] + }' +``` + +The request reaches OpenAI even though a cached entry exists, and the response is not written back to cache. You can confirm the upstream was contacted because the response includes the original `created`, `model`, `usage`, and `system_fingerprint` fields: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: BYPASS + +{ + "id": "chatcmpl-Da7N4E9fA6KoQ7av98hL0zxplPCcD", + "object": "chat.completion", + "created": 1777500514, + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital of France is Paris.", + "refusal": null + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 14, + "completion_tokens": 7, + "total_tokens": 21 + }, + "system_fingerprint": "fp_d3214ccada" +} +``` + +## Caveats + +### The semantic-layer write is asynchronous + +After a `MISS`, the embedding fetch and Redis vector store happen in a background timer. If you send a paraphrased prompt immediately after the first request, you may see another `MISS` because the entry has not been stored yet. Wait a couple of seconds before sending a paraphrase to verify a semantic hit. + +### Similarity is mathematical, not human-judged + +Two prompts that look semantically equivalent to a human can score below the configured `similarity_threshold` and therefore miss the cache. Conversely, a small wording change can flip the result. For example, with `similarity_threshold` set to `0.85` and the cache primed with `"What is the capital of France?"`: + +| Prompt | Status | Similarity | +|--------|--------|------------| +| `capital of France?` | `HIT-L2` | `0.850` | +| `capital of France what?` | `MISS` | (below threshold) | +| `capital of France what is?` | `HIT-L2` | `0.972` | +| `capital of France what please?` | `HIT-L2` | `0.924` | +| `capital of France what is please tell me?` | `MISS` | (below threshold) | + +Lower the threshold to catch more paraphrases at the cost of occasionally serving a cached answer for a genuinely different question. Tune empirically against your traffic. 
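+
+For intuition, the similarity scores above are plain cosine similarity between
+the two prompts' embedding vectors: Redis returns a cosine *distance*, and the
+Plugin converts it with `similarity = 1 - dist` before comparing against the
+threshold. A self-contained sketch of the same check, using toy 3-dimensional
+vectors in place of real embeddings (which have e.g. 1536 dimensions):
+
+```lua
+local function cosine_similarity(a, b)
+    local dot, na, nb = 0, 0, 0
+    for i = 1, #a do
+        dot = dot + a[i] * b[i]
+        na  = na + a[i] * a[i]
+        nb  = nb + b[i] * b[i]
+    end
+    return dot / (math.sqrt(na) * math.sqrt(nb))
+end
+
+-- Toy vectors standing in for two paraphrased prompts
+local cached = { 0.12, 0.98, 0.05 }
+local query  = { 0.10, 0.99, 0.07 }
+
+local sim = cosine_similarity(cached, query)
+print(sim)           -- ~0.9996, well above a 0.85 threshold
+print(sim >= 0.85)   -- true: the entry would be served as HIT-L2
+```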
+ +### Embedding model dimensions are baked into the index + +Redis Stack creates the vector index on the first request with a fixed `DIM` matching the embedding vector size (for example `1536` for `text-embedding-3-small`, `3072` for `text-embedding-3-large`). If you switch embedding models, or if the index was created with different-sized vectors during testing, subsequent requests will fail with a size-mismatch error in the APISIX warn log: + +```text +ai-cache: L2 search error: Error parsing vector similarity query: +query vector blob size (6144) does not match index's expected size (12). +``` + +The Plugin degrades to `MISS` so requests still succeed, but the semantic layer effectively stops working. Drop the index to recover; it will be recreated on the next request with the correct dimension: + +```shell +docker exec redis-cli FT.DROPINDEX ai-cache-idx DD +docker exec redis-cli --raw KEYS "ai-cache:*" \ + | xargs -r docker exec -i redis-cli DEL +``` + +### `BYPASS` does not refresh the cache + +A request with the bypass header reaches the upstream but its response is not written back. Use it to force a fresh upstream call without invalidating or replacing the existing cached entry. + +### The semantic layer requires Redis Stack + +The `FT.CREATE` and `FT.SEARCH` commands used by the semantic layer come from the RediSearch module. Vanilla Redis will fail these commands and the layer will silently degrade to `MISS`. Use a Redis Stack image such as `redis/redis-stack:latest`. From 62850aa92e165503b37420d1c5f7d5b6b99dae60 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 06:55:35 +0800 Subject: [PATCH 10/38] feat(ai-cache): segregate L2 index by embedding dimension --- apisix/plugins/ai-cache/semantic.lua | 27 +++++++++++++++++---------- t/plugin/ai-cache.t | 1 + 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index 38ace604a617..38055daad906 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ b/apisix/plugins/ai-cache/semantic.lua @@ -26,11 +26,18 @@ local tostring = tostring local tonumber = tonumber local type = type -local INDEX_NAME = "ai-cache-idx" -local KEY_PREFIX = "ai-cache:l2:" - local _M = {} + +local function index_name(dim) + return "ai-cache-idx-" .. dim +end + + +local function key_prefix(dim) + return "ai-cache:l2:" .. dim .. ":" +end + local function pack_vector(vec) local n = #vec local buf = ffi_new("float[?]", n) @@ -40,17 +47,17 @@ local function pack_vector(vec) return ffi_string(buf, n * 4) end -local index_ready = false +local index_ready = {} local function ensure_index(red, dim) - if index_ready then + if index_ready[dim] then return true end local _, err = red["FT.CREATE"](red, - INDEX_NAME, + index_name(dim), "ON", "HASH", - "PREFIX", "1", KEY_PREFIX, + "PREFIX", "1", key_prefix(dim), "SCHEMA", "embedding", "VECTOR", "HNSW", "6", "TYPE", "FLOAT32", @@ -64,7 +71,7 @@ local function ensure_index(red, dim) return nil, "FT.CREATE failed: " .. err end - index_ready = true + index_ready[dim] = true return true end @@ -91,7 +98,7 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) end local res, search_err = red["FT.SEARCH"](red, - INDEX_NAME, + index_name(#embedding_vec), query, "PARAMS", "2", "vec", binary_vec, "SORTBY", "dist", "ASC", @@ -150,7 +157,7 @@ function _M.store(conf, scope_hash, embedding_vec, text, ttl) end local binary_vec = pack_vector(embedding_vec) - local key = KEY_PREFIX .. 
uuid.generate_v4() + local key = key_prefix(#embedding_vec) .. uuid.generate_v4() local set_ok, set_err = red:hset(key, "embedding", binary_vec, diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index 11aed3bc2da3..e5e012828db9 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -593,6 +593,7 @@ status: 500 assert(red:connect("127.0.0.1", 6379)) red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") + red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD") local keys = red:keys("ai-cache:*") if type(keys) == "table" and #keys > 0 then From f16c7e3bad1903e0b58805cdd4d5455f93bb02f0 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 06:56:41 +0800 Subject: [PATCH 11/38] docs(ai-cache): update doc to include Redis attr and update caveat --- docs/en/latest/plugins/ai-cache.md | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 71de88a32261..0571902bcedf 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -59,8 +59,16 @@ The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-m | `headers.cache_status` | string | False | `"X-AI-Cache-Status"` | | Response header for cache status (`HIT-L1`, `HIT-L2`, `MISS`, `BYPASS`). | | `headers.cache_age` | string | False | `"X-AI-Cache-Age"` | | Response header for the age in seconds of an exact-layer hit. | | `headers.cache_similarity` | string | False | `"X-AI-Cache-Similarity"` | | Response header for the similarity score of a semantic-layer hit. | - -Redis connection fields (`redis_host`, `redis_port`, `redis_password`, `redis_database`, `redis_timeout`, `redis_ssl`, `redis_ssl_verify`, `redis_username`, `redis_keepalive_timeout`, `redis_keepalive_pool`) follow the shared Redis schema. At minimum, `redis_host` is required. +| `redis_host` | string | True | | | The address of the Redis node. | +| `redis_port` | integer | False | `6379` | [1,...] | The port of the Redis node. | +| `redis_username` | string | False | | | The username for Redis if Redis ACL is used. If you use the legacy authentication method `requirepass`, configure only the `redis_password`. | +| `redis_password` | string | False | | | The password of the Redis node. | +| `redis_database` | integer | False | `0` | >= 0 | The database number in Redis. | +| `redis_timeout` | integer | False | `1000` | [1,...] | The Redis timeout value in milliseconds. | +| `redis_ssl` | boolean | False | `false` | | If `true`, use SSL to connect to Redis. | +| `redis_ssl_verify` | boolean | False | `false` | | If `true`, verify the server SSL certificate. | +| `redis_keepalive_timeout` | integer | False | `10000` | [1000,...] | Idle timeout in milliseconds for the Redis connection in the keepalive pool. | +| `redis_keepalive_pool` | integer | False | `100` | [1,...] | Maximum number of idle Redis connections kept in the keepalive pool. | ## Examples @@ -630,22 +638,9 @@ Two prompts that look semantically equivalent to a human can score below the con Lower the threshold to catch more paraphrases at the cost of occasionally serving a cached answer for a genuinely different question. Tune empirically against your traffic. -### Embedding model dimensions are baked into the index - -Redis Stack creates the vector index on the first request with a fixed `DIM` matching the embedding vector size (for example `1536` for `text-embedding-3-small`, `3072` for `text-embedding-3-large`). 
If you switch embedding models, or if the index was created with different-sized vectors during testing, subsequent requests will fail with a size-mismatch error in the APISIX warn log: - -```text -ai-cache: L2 search error: Error parsing vector similarity query: -query vector blob size (6144) does not match index's expected size (12). -``` - -The Plugin degrades to `MISS` so requests still succeed, but the semantic layer effectively stops working. Drop the index to recover; it will be recreated on the next request with the correct dimension: +### Switching embedding models is safe -```shell -docker exec redis-cli FT.DROPINDEX ai-cache-idx DD -docker exec redis-cli --raw KEYS "ai-cache:*" \ - | xargs -r docker exec -i redis-cli DEL -``` +The Plugin namespaces the L2 index and entries by embedding dimension (for example `1536` for `text-embedding-3-small`, `3072` for `text-embedding-3-large`), so changing the embedding model on a live route does not require any manual cleanup. A new index is created automatically for the new dimension; old entries from the previous model expire via the configured `semantic.ttl`. ### `BYPASS` does not refresh the cache From 087356093c67686ca9a692e2fd4895f30bd6b4e5 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 07:47:58 +0800 Subject: [PATCH 12/38] test(ai-cache): add cache_key scope partitioning tests in dedicated file --- apisix/plugins/ai-cache/semantic.lua | 2 +- t/plugin/ai-cache-scope.t | 355 +++++++++++++++++++++++++++ 2 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 t/plugin/ai-cache-scope.t diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index 38055daad906..b44ee5675711 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ b/apisix/plugins/ai-cache/semantic.lua @@ -94,7 +94,7 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) if scope_hash == "" then query = "*=>[KNN 1 @embedding $vec AS dist]" else - query = "@scope:{" .. scope_hash .. "} *=>[KNN 1 @embedding $vec AS dist]" + query = "@scope:{" .. scope_hash .. "}=>[KNN 1 @embedding $vec AS dist]" end local res, search_err = red["FT.SEARCH"](red, diff --git a/t/plugin/ai-cache-scope.t b/t/plugin/ai-cache-scope.t new file mode 100644 index 000000000000..97614fe173f8 --- /dev/null +++ b/t/plugin/ai-cache-scope.t @@ -0,0 +1,355 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +BEGIN { + $ENV{TEST_ENABLE_CONTROL_API_V1} = "0"; +} + +use t::APISIX 'no_plan'; + +log_level("info"); +repeat_each(1); +no_long_string(); +no_shuffle(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + if (!$block->error_log && !$block->no_error_log) { + $block->set_value("no_error_log", "[error]\n[alert]"); + } + + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + listen 1990; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + local fixture_loader = require("lib.fixture_loader") + local content, err = fixture_loader.load("openai/embeddings-list.json") + if not content then + ngx.status = 500 + ngx.say(err) + return + end + + ngx.status = 200 + ngx.print(content) + } + } +} +_EOC_ + } +}); + +run_tests(); + +__DATA__ + +=== TEST 1: set up route with cache_key include_vars +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/scoped", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "cache_key": { + "include_vars": ["$http_x_tenant_id"] + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 2: tenant-a first request - MISS +--- request +POST /scoped +{"messages":[{"role":"user","content":"scope test prompt"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 3: tenant-b same prompt - MISS (proves cache_key partitioning) +--- request +POST /scoped +{"messages":[{"role":"user","content":"scope test prompt"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-b +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 4: tenant-a same prompt again - HIT-L1 +--- request +POST /scoped +{"messages":[{"role":"user","content":"scope test prompt"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 + + + +=== TEST 5: set up consumers for include_consumer test +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + local consumers = { + { username = "alice", key = "alice-key" }, + { username = "bob", key = "bob-key" }, + } + + for _, c in ipairs(consumers) do + local code, body = t('/apisix/admin/consumers', + ngx.HTTP_PUT, + string.format([[{ + "username": "%s", + "plugins": { "key-auth": { "key": "%s" } } + }]], c.username, c.key) + ) + if code >= 300 then + ngx.status = code + ngx.say(body) + return + end + end + ngx.say("passed") + } + } +--- response_body +passed + + + +=== TEST 6: set up route with cache_key include_consumer + key-auth +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/2', + ngx.HTTP_PUT, + [[{ + "uri": "/per-consumer", + "plugins": { + "key-auth": {}, + 
"ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "cache_key": { + "include_consumer": true + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 7: alice first request - MISS +--- request +POST /per-consumer +{"messages":[{"role":"user","content":"per-consumer prompt"}]} +--- more_headers +Content-Type: application/json +apikey: alice-key +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 8: bob same prompt - MISS (proves include_consumer partitioning) +--- request +POST /per-consumer +{"messages":[{"role":"user","content":"per-consumer prompt"}]} +--- more_headers +Content-Type: application/json +apikey: bob-key +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 9: set up route with L2 semantic + cache_key include_vars +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/3', + ngx.HTTP_PUT, + [[{ + "uri": "/scoped-semantic", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { "ttl": 60 }, + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings", + "api_key": "test-key" + } + }, + "cache_key": { + "include_vars": ["$http_x_tenant_id"] + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 10: tenant-a first request - MISS, writes to L2 with scope=hash(tenant-a) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 11: tenant-b same prompt - MISS (FT.SEARCH scope filter excludes tenant-a's entry) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-b +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 12: tenant-a paraphrase - HIT-L2 (scope filter finds tenant-a's entry) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 From 9be2eccfe6a9fb64e182162a58d689f861264a8c Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 07:58:06 +0800 Subject: [PATCH 13/38] feat(ai-cache): register plugin in default plugin list --- apisix/cli/config.lua | 1 + 1 file changed, 1 insertion(+) diff --git 
a/apisix/cli/config.lua b/apisix/cli/config.lua index 956eef30c267..b4df580666c0 100644 --- a/apisix/cli/config.lua +++ b/apisix/cli/config.lua @@ -231,6 +231,7 @@ local _M = { "ai-prompt-template", "ai-prompt-decorator", "ai-prompt-guard", + "ai-cache", "ai-rag", "ai-rate-limiting", "ai-proxy-multi", From d691ea28a247101c060a2eceefd8999030a1c87b Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:16:32 +0800 Subject: [PATCH 14/38] fix(ai-cache): remove sort from compute_scope_hash to prevent cross-user cache collisions --- apisix/plugins/ai-cache/exact.lua | 2 -- 1 file changed, 2 deletions(-) diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index 4b4d36b157a6..a9766906508b 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -21,7 +21,6 @@ local resty_sha256 = require("resty.sha256") local to_hex = require("resty.string").to_hex local table_concat = table.concat -local table_sort = table.sort local ngx_time = ngx.time local tostring = tostring @@ -67,7 +66,6 @@ function _M.compute_scope_hash(conf, ctx) return "" end - table_sort(parts) return sha256_hex(table_concat(parts, "|")) end From 61ea49c7fb0db0760670eba7434432ebcb89fc5c Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:19:08 +0800 Subject: [PATCH 15/38] feat(ai-cache): add apisix_ai_cache_embedding_failures_total metric --- apisix/plugins/ai-cache.lua | 1 + apisix/plugins/prometheus/exporter.lua | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 7caf6cf548ee..f1061272d819 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -139,6 +139,7 @@ function _M.access(conf, ctx) local embedding, _, emb_err = emb_driver.get_embeddings(emb_conf, prompt_text, httpc, true) if not embedding then core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err) + ctx.ai_cache_embedding_failed = true else ctx.ai_cache_embedding_latency_ms = (ngx_now() - t0) * 1000 ctx.ai_cache_embedding_provider = emb_conf.provider diff --git a/apisix/plugins/prometheus/exporter.lua b/apisix/plugins/prometheus/exporter.lua index cf008400b0f0..78ce1bac0bf5 100644 --- a/apisix/plugins/prometheus/exporter.lua +++ b/apisix/plugins/prometheus/exporter.lua @@ -166,6 +166,8 @@ function _M.http_init(prometheus_enabled_in_stream) "ai_cache_misses", "expire") local ai_cache_embedding_latency_exptime = core.table.try_read_attr(attr, "metrics", "ai_cache_embedding_latency", "expire") + local ai_cache_embedding_failures_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_embedding_failures", "expire") prometheus = base_prometheus.init("prometheus-metrics", metric_prefix) @@ -289,6 +291,12 @@ function _M.http_init(prometheus_enabled_in_stream) ai_cache_embedding_latency_buckets, ai_cache_embedding_latency_exptime) + metrics.ai_cache_embedding_failures = prometheus:counter("ai_cache_embedding_failures_total", + "AI cache embedding API call failure count", + {"route_id", "service_id", "consumer", + unpack(extra_labels("ai_cache_embedding_failures"))}, + ai_cache_embedding_failures_exptime) + if prometheus_enabled_in_stream then init_stream_metrics() end @@ -428,6 +436,12 @@ function _M.http_log(conf, ctx) ctx.ai_cache_embedding_provider or "", unpack(extra_labels("ai_cache_embedding_latency", ctx)))) end + + if ctx.ai_cache_embedding_failed then + metrics.ai_cache_embedding_failures:inc(1, + gen_arr(route_id, service_id, consumer_name, + 
unpack(extra_labels("ai_cache_embedding_failures", ctx)))) + end end end From bf31bc83245648633e3382469577d6240acc4f6d Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:28:48 +0800 Subject: [PATCH 16/38] docs(ai-cache): add Ingress Controller tabs to all examples --- docs/en/latest/plugins/ai-cache.md | 432 +++++++++++++++++++++++++++++ 1 file changed, 432 insertions(+) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 0571902bcedf..2831340ebd8d 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -162,6 +162,109 @@ Synchronize the configuration to the gateway: adc sync -f adc.yaml ``` + + + + + + +Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such: + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + exact: + ttl: 3600 + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such: + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + exact: + ttl: 3600 + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + @@ -334,6 +437,123 @@ Synchronize the configuration to the gateway: adc sync -f adc.yaml ``` + + + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + - semantic + exact: + ttl: 3600 + semantic: + similarity_threshold: 0.92 + ttl: 86400 + embedding: + provider: openai + endpoint: https://api.openai.com/v1/embeddings + api_key: your-api-key + model: text-embedding-3-small + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: 
apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + - semantic + exact: + ttl: 3600 + semantic: + similarity_threshold: 0.92 + ttl: 86400 + embedding: + provider: openai + endpoint: https://api.openai.com/v1/embeddings + api_key: your-api-key + model: text-embedding-3-small + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + @@ -490,6 +710,113 @@ Synchronize the configuration to the gateway: adc sync -f adc.yaml ``` + + + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + exact: + ttl: 3600 + cache_key: + include_consumer: true + include_vars: + - "$http_x_tenant_id" + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + exact: + ttl: 3600 + cache_key: + include_consumer: true + include_vars: + - "$http_x_tenant_id" + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + @@ -569,6 +896,111 @@ Synchronize the configuration to the gateway: adc sync -f adc.yaml ``` + + + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + exact: + ttl: 3600 + bypass_on: + - header: X-Cache-Bypass + equals: "1" + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: 
ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + exact: + ttl: 3600 + bypass_on: + - header: X-Cache-Bypass + equals: "1" + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + From 803f741aba356c44e2c3856d2971ba5d2fb4583e Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:33:34 +0800 Subject: [PATCH 17/38] test(ai-cache): add multi-rule bypass_on test cases --- t/plugin/ai-cache.t | 105 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 89 insertions(+), 16 deletions(-) diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index e5e012828db9..5b3a6f654c6c 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -323,7 +323,80 @@ X-AI-Cache-Status: MISS -=== TEST 12: set up route for 4xx test +=== TEST 12: set up route with two bypass rules +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/chat", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1", + "bypass_on": [ + {"header": "X-Cache-Bypass", "equals": "1"}, + {"header": "X-Debug", "equals": "true"} + ] + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 13: first bypass rule matches - BYPASS +--- request +POST /chat +{"messages":[{"role":"user","content":"multi-rule bypass test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Cache-Bypass: 1 +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 14: second bypass rule matches - BYPASS +--- request +POST /chat +{"messages":[{"role":"user","content":"multi-rule bypass test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Debug: true +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 15: set up route for 4xx test --- config location /t { content_by_lua_block { @@ -364,7 +437,7 @@ passed -=== TEST 13: 4xx from upstream - not cached +=== TEST 16: 4xx from upstream - not cached --- request POST /error {"messages":[{"role":"user","content":"trigger an error please"}]} @@ -378,7 +451,7 @@ X-AI-Cache-Status: MISS -=== TEST 14: same prompt after 4xx - still MISS (4xx was not cached) +=== TEST 17: same prompt after 4xx - still MISS (4xx was not cached) --- request POST /error {"messages":[{"role":"user","content":"trigger an error please"}]} @@ -392,7 +465,7 @@ X-AI-Cache-Status: MISS -=== TEST 15: openai driver - parses embedding vector correctly +=== TEST 18: openai driver - parses embedding vector correctly --- http_config server { listen 1990; @@ -453,7 +526,7 @@ ok: 0.1 0.2 0.3 -=== TEST 16: openai driver - 429 from API return nil with status +=== TEST 19: openai driver - 429 from API return nil with status --- http_config server { listen 1990; @@ -492,7 +565,7 @@ status: 429 -=== TEST 17: azure_openai driver - parses embedding vector correctly +=== TEST 20: azure_openai driver - parses embedding vector correctly --- 
http_config server { listen 1990; @@ -544,7 +617,7 @@ ok: 0.4 0.5 0.6 -=== TEST 18: openai driver - 500 from API returns nil with status +=== TEST 21: openai driver - 500 from API returns nil with status --- http_config server { listen 1990; @@ -583,7 +656,7 @@ status: 500 -=== TEST 19: clean up L2 state before semantic tests +=== TEST 22: clean up L2 state before semantic tests --- config location /t { content_by_lua_block { @@ -609,7 +682,7 @@ ok -=== TEST 20: set up route for L2 semantic cache tests +=== TEST 23: set up route for L2 semantic cache tests --- config location /t { content_by_lua_block { @@ -660,7 +733,7 @@ passed -=== TEST 21: L2 - first request, cache MISS, stored in L2 +=== TEST 24: L2 - first request, cache MISS, stored in L2 --- request POST /semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -673,7 +746,7 @@ X-AI-Cache-Status: MISS -=== TEST 22: L2 - different wording hits L2 (same vector from fixture) +=== TEST 25: L2 - different wording hits L2 (same vector from fixture) --- request POST /semantic {"messages":[{"role":"user","content":"Name the capital city of France"}]} @@ -690,7 +763,7 @@ ai-cache: L2 hit -=== TEST 23: L2 - original prompt now hits L1 (backfilled by the L2 hit) +=== TEST 26: L2 - original prompt now hits L1 (backfilled by the L2 hit) --- request POST /semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -705,7 +778,7 @@ ai-cache: L1 hit for key -=== TEST 24: L2 degradation - search error results in MISS, not 500 +=== TEST 27: L2 degradation - search error results in MISS, not 500 --- config location /t { content_by_lua_block { @@ -729,7 +802,7 @@ qr/degraded gracefully|miss, no error/ -=== TEST 25: streaming MISS - upstream called, response cached via log phase +=== TEST 28: streaming MISS - upstream called, response cached via log phase --- request POST /chat {"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} @@ -742,7 +815,7 @@ X-AI-Cache-Status: MISS -=== TEST 26: streaming HIT - Content-Type is text/event-stream, SSE body returned +=== TEST 29: streaming HIT - Content-Type is text/event-stream, SSE body returned --- request POST /chat {"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} @@ -758,7 +831,7 @@ qr/data:.*content/ -=== TEST 27: non-streaming HIT after streaming MISS - returns JSON +=== TEST 30: non-streaming HIT after streaming MISS - returns JSON --- request POST /chat {"messages":[{"role":"user","content":"Stream me something cool"}]} From 0c268703f99f7f83d08d98d51029b39ccd584458 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:35:24 +0800 Subject: [PATCH 18/38] docs(ai-cache): note that bypass header is unauthenticated and should be gated --- docs/en/latest/plugins/ai-cache.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 2831340ebd8d..9bd98b4f5f57 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -1078,6 +1078,8 @@ The Plugin namespaces the L2 index and entries by embedding dimension (for examp A request with the bypass header reaches the upstream but its response is not written back. Use it to force a fresh upstream call without invalidating or replacing the existing cached entry. +The bypass header is not authenticated — any client that can set the configured header and value can bypass the cache. 
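+For instance, a sketch of a bypass request, assuming the `bypass_on` rule (`X-Cache-Bypass: 1`) and the `/anything` route from the examples above, with the gateway address taken from the quickstart defaults:
+
+```shell
+curl "http://127.0.0.1:9080/anything" -X POST \
+  -H "Content-Type: application/json" \
+  -H "X-Cache-Bypass: 1" \
+  -d '{"messages":[{"role":"user","content":"What is 1+1?"}]}'
+```
+
+The response should carry `X-AI-Cache-Status: BYPASS`, and any existing cached entry for the prompt stays untouched.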
In production, gate access with an APISIX plugin such as `key-auth` (authentication) or `ip-restriction` (IP allow-listing), or strip the header at a WAF or proxy in front of APISIX.
+
 ### The semantic layer requires Redis Stack

 The `FT.CREATE` and `FT.SEARCH` commands used by the semantic layer come from the RediSearch module. Vanilla Redis will fail these commands and the layer will silently degrade to `MISS`. Use a Redis Stack image such as `redis/redis-stack:latest`.

From c345fe01c7111f5b15da156d9a87470bc450860a Mon Sep 17 00:00:00 2001
From: janiussyafiq
Date: Fri, 1 May 2026 00:07:00 +0800
Subject: [PATCH 19/38] chore(ai-cache): fix CI failures across lint, eclint,
 misc-checker, and admin tests

---
 Makefile                                      |  5 +++++
 apisix/plugins/ai-cache.lua                   | 17 ++++++++++-------
 .../ai-cache/embeddings/azure_openai.lua      |  4 +++-
 apisix/plugins/ai-cache/embeddings/openai.lua |  4 +++-
 apisix/plugins/ai-cache/exact.lua             |  4 +++-
 apisix/plugins/ai-cache/schema.lua            |  2 +-
 t/admin/plugins.t                             |  1 +
 t/plugin/ai-cache.t                           | 16 ++++++++--------
 8 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/Makefile b/Makefile
index 71ab7df1eabf..ad074ecabbdf 100644
--- a/Makefile
+++ b/Makefile
@@ -388,6 +388,11 @@ install: runtime
 	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-transport
 	$(ENV_INSTALL) apisix/plugins/ai-transport/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-transport

+	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache
+	$(ENV_INSTALL) apisix/plugins/ai-cache/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache
+	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings
+	$(ENV_INSTALL) apisix/plugins/ai-cache/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings
+
 	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings
 	$(ENV_INSTALL) apisix/plugins/ai-rag/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings
 	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/vector-search

diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua
index f1061272d819..2c8942a9f791 100644
--- a/apisix/plugins/ai-cache.lua
+++ b/apisix/plugins/ai-cache.lua
@@ -21,9 +21,12 @@ local exact = require("apisix.plugins.ai-cache.exact")
 local semantic = require("apisix.plugins.ai-cache.semantic")
 local protocols = require("apisix.plugins.ai-protocols")
 local http = require("resty.http")
-local ngx_time = ngx.time
-local ngx_now = ngx.now
-local tostring = tostring
+local ngx = ngx
+local ngx_time = ngx.time
+local ngx_now = ngx.now
+local ipairs = ipairs
+local require = require
+local tostring = tostring
 local table_concat = table.concat

 local plugin_name = "ai-cache"
@@ -122,10 +125,10 @@ function _M.access(conf, ctx)
         else
             core.response.set_header("Content-Type", "application/json")
         end
-        return core.response.exit(200, proto.build_deny_response({
+        return 200, proto.build_deny_response({
             stream = is_stream,
             text = cached_text,
-        }))
+        })
     end
 end
@@ -169,10 +172,10 @@ function _M.access(conf, ctx)
         else
             core.response.set_header("Content-Type", "application/json")
         end
-        return core.response.exit(200, proto.build_deny_response({
+        return 200, proto.build_deny_response({
             stream = is_stream,
             text = cached_text,
-        }))
+        })
     end
 end
 end
diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua
index a52c5e265497..928a803f948e 100644
--- a/apisix/plugins/ai-cache/embeddings/azure_openai.lua
+++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua
@@ -16,8 +16,10 @@
 --
 local core = require("apisix.core")
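+-- Hoisting globals such as `type` and `ngx` into module-level locals below
+-- satisfies the lint checks named in this commit and replaces a global-table
+-- lookup with a cheaper local access on each call.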
+local type = type -local HTTP_OK = ngx.HTTP_OK +local ngx = ngx +local HTTP_OK = ngx.HTTP_OK local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR local _M = {} diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua index ff50f2bbea27..0ca8c7cd61af 100644 --- a/apisix/plugins/ai-cache/embeddings/openai.lua +++ b/apisix/plugins/ai-cache/embeddings/openai.lua @@ -16,8 +16,10 @@ -- local core = require("apisix.core") +local type = type -local HTTP_OK = ngx.HTTP_OK +local ngx = ngx +local HTTP_OK = ngx.HTTP_OK local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR local _M = {} diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index a9766906508b..d442cd89c467 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -20,9 +20,11 @@ local redis = require("apisix.utils.redis") local resty_sha256 = require("resty.sha256") local to_hex = require("resty.string").to_hex -local table_concat = table.concat +local ngx = ngx local ngx_time = ngx.time +local ipairs = ipairs local tostring = tostring +local table_concat = table.concat local KEY_PREFIX = "ai-cache:l1:" diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 444858cc9067..9416af97f638 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -124,4 +124,4 @@ _M.schema = { encrypt_fields = { "semantic.embedding.api_key", "redis_password" }, } -return _M \ No newline at end of file +return _M diff --git a/t/admin/plugins.t b/t/admin/plugins.t index adb98b28bc17..1454ec145eb0 100644 --- a/t/admin/plugins.t +++ b/t/admin/plugins.t @@ -98,6 +98,7 @@ ai-request-rewrite ai-prompt-guard ai-prompt-template ai-prompt-decorator +ai-cache ai-rag ai-aws-content-moderation ai-proxy-multi diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index 5b3a6f654c6c..1657fbec3d76 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -1,18 +1,18 @@ # -# Licensed to the Apache Software Foundation (ASF) under one or more +# Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 +# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# +# the License. You may obtain a copy of the License at +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, +# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. +# limitations under the License. 
# BEGIN { @@ -83,7 +83,7 @@ __DATA__ ngx.say("failed") else ngx.say("passed") - end + end } } --- response_body From e665d2a4830c1076e49642764c55d47e009e321b Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Fri, 1 May 2026 05:06:56 +0800 Subject: [PATCH 20/38] docs(ai-cache): document top_k and fix HTTPS endpoint wording --- docs/en/latest/plugins/ai-cache.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 9bd98b4f5f57..3c1789131134 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -47,9 +47,10 @@ The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-m | `layers` | array[string] | False | `["exact", "semantic"]` | `"exact"`, `"semantic"` | Cache layers to enable, queried in order. | | `exact.ttl` | integer | False | `3600` | ≥ 1 | Time-to-live in seconds for exact-layer entries. | | `semantic.similarity_threshold` | number | False | `0.95` | 0–1 | Minimum cosine similarity required for a semantic-layer hit. | +| `semantic.top_k` | integer | False | `1` | ≥ 1 | Number of nearest-neighbor candidates the index returns; the first candidate above `similarity_threshold` is used. | | `semantic.ttl` | integer | False | `86400` | ≥ 1 | Time-to-live in seconds for semantic-layer entries. | | `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. | -| `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | HTTPS URL of the embedding API. | +| `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | Embedding API endpoint URL. | | `semantic.embedding.api_key` | string | True (if semantic enabled) | | | API key for the embedding provider. Stored encrypted. | | `semantic.embedding.model` | string | False | | | Embedding model name. Uses provider default if omitted. | | `cache_key.include_consumer` | boolean | False | `false` | | If `true`, partition the cache by consumer name. | From 6b996cd9de2988ff83790b02176b340d59156178 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Fri, 1 May 2026 05:12:30 +0800 Subject: [PATCH 21/38] feat(ai-cache): wire semantic.top_k through L2 vector search --- apisix/plugins/ai-cache/semantic.lua | 54 +++++++++++++++------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index b44ee5675711..4d4cf1c8dedd 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ b/apisix/plugins/ai-cache/semantic.lua @@ -89,12 +89,15 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) end local binary_vec = pack_vector(embedding_vec) + local top_k = (conf.semantic and conf.semantic.top_k) or 1 + local top_k_str = tostring(top_k) local query if scope_hash == "" then - query = "*=>[KNN 1 @embedding $vec AS dist]" + query = "*=>[KNN " .. top_k_str .. " @embedding $vec AS dist]" else - query = "@scope:{" .. scope_hash .. "}=>[KNN 1 @embedding $vec AS dist]" + query = "@scope:{" .. scope_hash .. "}=>[KNN " .. top_k_str + .. 
" @embedding $vec AS dist]" end local res, search_err = red["FT.SEARCH"](red, @@ -102,7 +105,7 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) query, "PARAMS", "2", "vec", binary_vec, "SORTBY", "dist", "ASC", - "LIMIT", "0", "1", + "LIMIT", "0", top_k_str, "RETURN", "2", "response", "dist", "DIALECT", "2" ) @@ -116,31 +119,32 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) return nil, nil, nil end - -- RESP2: {count, key, {field, val, field, val, ...}, ...} - local fields = res[3] - if type(fields) ~= "table" then - return nil, nil, nil - end - - local response_text, dist - for i = 1, #fields, 2 do - if fields[i] == "response" then - response_text = fields[i + 1] - elseif fields[i] == "dist" then - dist = tonumber(fields[i + 1]) + -- RESP2: {count, key1, fields1, key2, fields2, ...} + -- Results are sorted by dist ASC. Iterate candidates and return the first + -- one whose similarity meets the threshold; skip candidates with missing + -- or corrupt fields. + for i = 3, #res, 2 do + local fields = res[i] + if type(fields) == "table" then + local response_text, dist + for j = 1, #fields, 2 do + if fields[j] == "response" then + response_text = fields[j + 1] + elseif fields[j] == "dist" then + dist = tonumber(fields[j + 1]) + end + end + + if response_text and dist then + local similarity = 1 - dist + if similarity >= threshold then + return response_text, similarity, nil + end + end end end - if not response_text or not dist then - return nil, nil, nil - end - - local similarity = 1 - dist - if similarity < threshold then - return nil, nil, nil - end - - return response_text, similarity, nil + return nil, nil, nil end From 57687ca16155a7289c4608f0392ef7f450658c69 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Fri, 1 May 2026 05:18:07 +0800 Subject: [PATCH 22/38] fix(ai-cache): guard log() against nil cache key fields on early-MISS --- apisix/plugins/ai-cache.lua | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 2c8942a9f791..cbfd37cbd846 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -215,6 +215,12 @@ function _M.log(conf, ctx) return end + -- Early-MISS paths (body parse / protocol detect / hash failure) skip + -- key computation, so bail out if cache key fields are absent. + if not ctx.ai_cache_prompt_hash or not ctx.ai_cache_prompt_text then + return + end + local upstream_status = core.response.get_upstream_status(ctx) or ngx.status if not upstream_status or upstream_status < 200 or upstream_status >= 300 then return From 1e42c8672748c04dae1468f14e26d078b4bf91f4 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Fri, 1 May 2026 06:41:47 +0800 Subject: [PATCH 23/38] docs(ai-cache): mark build_deny_response usage as a follow-up rename --- apisix/plugins/ai-cache.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index cbfd37cbd846..a095081f9f20 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -125,6 +125,9 @@ function _M.access(conf, ctx) else core.response.set_header("Content-Type", "application/json") end + -- TODO: rename build_deny_response to build_response_from_text in a + -- follow-up. We use it here to wrap cached text in the protocol's + -- response shape, not for policy denial. 
return 200, proto.build_deny_response({ stream = is_stream, text = cached_text, From 3349acc060552151e9bb278636018cf8744acb2f Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 02:10:44 +0800 Subject: [PATCH 24/38] test(prometheus): expand ai-cache metric coverage and reorganize tests --- t/plugin/prometheus-ai-cache.t | 188 +++++++++++++++++++++++++++++++-- 1 file changed, 178 insertions(+), 10 deletions(-) diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t index fc4eb02264bc..02544fe97df7 100644 --- a/t/plugin/prometheus-ai-cache.t +++ b/t/plugin/prometheus-ai-cache.t @@ -49,6 +49,7 @@ plugins: - ai-cache - prometheus - public-api + - key-auth _EOC_ $block->set_value("extra_yaml_config", $user_yaml_config); @@ -72,6 +73,13 @@ server { ngx.print(content) } } + + location /v1/embeddings-fail { + content_by_lua_block { + ngx.status = 500 + ngx.say('{"error":"simulated embedding failure"}') + } + } } _EOC_ } @@ -91,7 +99,7 @@ __DATA__ { url = "/apisix/admin/routes/1", data = [[{ - "uri": "/chat", + "uri": "/exact", "plugins": { "prometheus": {}, "ai-proxy": { @@ -148,6 +156,76 @@ __DATA__ } }]], }, + { + url = "/apisix/admin/routes/3", + data = [[{ + "uri": "/semantic-fail", + "plugins": { + "prometheus": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["semantic"], + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings-fail", + "api_key": "test-key" + } + }, + "redis_host": "127.0.0.1" + } + } + }]], + }, + { + url = "/apisix/admin/routes/4", + data = [[{ + "uri": "/exact-auth", + "plugins": { + "prometheus": {}, + "key-auth": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1" + } + } + }]], + }, + { + url = "/apisix/admin/consumers", + data = [[{ + "username": "alice", + "plugins": { + "key-auth": { + "key": "alice-key" + } + } + }]], + }, { url = "/apisix/admin/routes/metrics", data = [[{ @@ -169,13 +247,13 @@ __DATA__ } } --- response_body eval -"passed\n" x 3 +"passed\n" x 6 === TEST 2: MISS request - upstream called --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the meaning of life?"}]} --- more_headers Content-Type: application/json @@ -188,7 +266,7 @@ X-AI-Cache-Status: MISS === TEST 3: same request - HIT-L1 --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the meaning of life?"}]} --- more_headers Content-Type: application/json @@ -217,7 +295,7 @@ qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1" === TEST 6: BYPASS request - upstream called, no cache interaction --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the meaning of life?"}]} --- more_headers Content-Type: application/json @@ -237,7 +315,15 @@ qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1\n/ -=== TEST 8: cleanup Redis L2 state before semantic tests +=== TEST 8: verify BYPASS did not increment hits counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval 
+qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1\n/ + + + +=== TEST 9: cleanup Redis L2 state before semantic tests --- config location /t { content_by_lua_block { @@ -262,7 +348,7 @@ ok -=== TEST 9: L2 first request - MISS, embedding API called +=== TEST 10: L2 first request - MISS, embedding API called --- request POST /semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -276,7 +362,7 @@ X-AI-Cache-Status: MISS -=== TEST 10: L2 second request - different wording, HIT-L2 +=== TEST 11: L2 second request - different wording, HIT-L2 --- request POST /semantic {"messages":[{"role":"user","content":"Name the capital city of France"}]} @@ -290,7 +376,15 @@ X-AI-Cache-Status: HIT-L2 -=== TEST 11: verify hits counter with layer="l2" +=== TEST 12: verify miss counter for semantic route (route_id=2) +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="2",service_id="",consumer=""\} 1/ + + + +=== TEST 13: verify hits counter with layer="l2" --- request GET /apisix/prometheus/metrics --- response_body_like eval @@ -298,8 +392,82 @@ qr/apisix_ai_cache_hits_total\{route_id="2",service_id="",consumer="",layer="l2" -=== TEST 12: verify embedding latency histogram with provider label +=== TEST 14: verify embedding latency histogram with provider label --- request GET /apisix/prometheus/metrics --- response_body_like eval qr/apisix_ai_cache_embedding_latency_count\{route_id="2",service_id="",consumer="",provider="openai"\} 2/ + + + +=== TEST 15: embedding failure - request still returns 200 via fallback +--- request +POST /semantic-fail +{"messages":[{"role":"user","content":"What does this fail at?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 16: verify embedding_failures counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_embedding_failures_total\{route_id="3",service_id="",consumer=""\} 1/ + + + +=== TEST 17: verify embedding-failure request also counted as miss +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="3",service_id="",consumer=""\} 1/ + + + +=== TEST 18: authenticated MISS request - consumer alice +--- request +POST /exact-auth +{"messages":[{"role":"user","content":"Authenticated cache test"}]} +--- more_headers +Content-Type: application/json +apikey: alice-key +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 19: authenticated HIT-L1 request - consumer alice +--- request +POST /exact-auth +{"messages":[{"role":"user","content":"Authenticated cache test"}]} +--- more_headers +Content-Type: application/json +apikey: alice-key +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- wait: 1 + + + +=== TEST 20: verify consumer label is populated on hits counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="4",service_id="",consumer="alice",layer="l1"\} 1/ + + + +=== TEST 21: verify consumer label is populated on misses counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="4",service_id="",consumer="alice"\} 1/ From 46ad8a65e98c11dc2ce69ae28383f30017b68465 Mon Sep 17 00:00:00 
2001 From: janiussyafiq Date: Tue, 5 May 2026 02:54:39 +0800 Subject: [PATCH 25/38] fix(ai-cache): populate AI ctx fields on cache hit --- apisix/plugins/ai-cache.lua | 13 +++++++++++++ t/plugin/prometheus-ai-cache.t | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index a095081f9f20..69f24bec9bfb 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -48,6 +48,17 @@ local function layer_enabled(conf, name) end +local function populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text) + ctx.ai_client_protocol = protocol_name + ctx.var.request_type = is_stream and "ai_stream" or "ai_chat" + if body_tab.model then + ctx.var.request_llm_model = body_tab.model + ctx.var.llm_model = body_tab.model + end + ctx.var.llm_response_text = cached_text +end + + function _M.check_schema(conf) local ok, err = core.schema.check(schema.schema, conf) if not ok then @@ -125,6 +136,7 @@ function _M.access(conf, ctx) else core.response.set_header("Content-Type", "application/json") end + populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text) -- TODO: rename build_deny_response to build_response_from_text in a -- follow-up. We use it here to wrap cached text in the protocol's -- response shape, not for policy denial. @@ -175,6 +187,7 @@ function _M.access(conf, ctx) else core.response.set_header("Content-Type", "application/json") end + populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text) return 200, proto.build_deny_response({ stream = is_stream, text = cached_text, diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t index 02544fe97df7..0863402ea521 100644 --- a/t/plugin/prometheus-ai-cache.t +++ b/t/plugin/prometheus-ai-cache.t @@ -471,3 +471,11 @@ qr/apisix_ai_cache_hits_total\{route_id="4",service_id="",consumer="alice",layer GET /apisix/prometheus/metrics --- response_body_like eval qr/apisix_ai_cache_misses_total\{route_id="4",service_id="",consumer="alice"\} 1/ + + + +=== TEST 22: verify cache hit is labelled as ai_chat (not traditional_http) +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_http_status\{code="200",route="1"[^}]*request_type="ai_chat"[^}]*response_source="apisix"[^}]*\} 1/ From 84c6b56b07456b6a4ddc7fe05e53159b91eee054 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 02:57:28 +0800 Subject: [PATCH 26/38] fix(ai-cache): reject empty layers array via schema --- apisix/plugins/ai-cache/schema.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 9416af97f638..9bb417183424 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -94,6 +94,7 @@ _M.schema = { type = "array", items = { type = "string", enum = { "exact", "semantic" } }, uniqueItems = true, + minItems = 1, default = { "exact", "semantic" }, }, cache_key = { From 8f9fbfd6adb5366a2a715f0133c145e7303bd9fc Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 03:03:49 +0800 Subject: [PATCH 27/38] fix(ai-cache): handle missing/dropped RediSearch index gracefully --- apisix/plugins/ai-cache/semantic.lua | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index 4d4cf1c8dedd..0e7519993cd1 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ 
b/apisix/plugins/ai-cache/semantic.lua @@ -48,8 +48,13 @@ local function pack_vector(vec) end local index_ready = {} +local index_unsupported = false local function ensure_index(red, dim) + if index_unsupported then + return nil, "RediSearch not supported on this Redis instance" + end + if index_ready[dim] then return true end @@ -67,8 +72,16 @@ local function ensure_index(red, dim) "created_at", "NUMERIC" ) - if err and not err:find("already exists") then - return nil, "FT.CREATE failed: " .. err + if err then + -- RediSearch module absent — latch and stop retrying on every request + if err:find("unknown command", 1, true) + or err:find("ERR unknown", 1, true) then + index_unsupported = true + return nil, "RediSearch not supported on this Redis instance: " .. err + end + if not err:find("already exists") then + return nil, "FT.CREATE failed: " .. err + end end index_ready[dim] = true @@ -112,6 +125,10 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) if search_err then + -- index was dropped externally — invalidate so next call recreates + if search_err:find("Unknown Index name", 1, true) then + index_ready[#embedding_vec] = nil + end return nil, nil, search_err end From d2eca3d2d2d5764a89b35a05c8146c8a6f9ecc85 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 03:22:44 +0800 Subject: [PATCH 28/38] test(ai-cache): cover symmetric HIT for include_consumer and L2 scope --- t/plugin/ai-cache-scope.t | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/t/plugin/ai-cache-scope.t b/t/plugin/ai-cache-scope.t index 97614fe173f8..21facdcf4688 100644 --- a/t/plugin/ai-cache-scope.t +++ b/t/plugin/ai-cache-scope.t @@ -257,10 +257,24 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 9: bob same prompt again - HIT-L1 (proves bob has own cache) +--- request +POST /per-consumer +{"messages":[{"role":"user","content":"per-consumer prompt"}]} +--- more_headers +Content-Type: application/json +apikey: bob-key +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 -=== TEST 9: set up route with L2 semantic + cache_key include_vars +=== TEST 10: set up route with L2 semantic + cache_key include_vars --- config location /t { content_by_lua_block { @@ -313,7 +327,7 @@ passed -=== TEST 10: tenant-a first request - MISS, writes to L2 with scope=hash(tenant-a) +=== TEST 11: tenant-a first request - MISS, writes to L2 with scope=hash(tenant-a) --- request POST /scoped-semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -328,7 +342,7 @@ X-AI-Cache-Status: MISS -=== TEST 11: tenant-b same prompt - MISS (FT.SEARCH scope filter excludes tenant-a's entry) +=== TEST 12: tenant-b same prompt - MISS (FT.SEARCH scope filter excludes tenant-a's entry) --- request POST /scoped-semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -339,10 +353,11 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS +--- wait: 1 -=== TEST 12: tenant-a paraphrase - HIT-L2 (scope filter finds tenant-a's entry) +=== TEST 13: tenant-a paraphrase - HIT-L2 (scope filter finds tenant-a's entry) --- request POST /scoped-semantic {"messages":[{"role":"user","content":"Name the capital city of France"}]} @@ -353,3 +368,17 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 
--- response_headers X-AI-Cache-Status: HIT-L2 + + + +=== TEST 14: tenant-b paraphrase - HIT-L2 (proves tenant-b has own L2 entry) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-b +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 From f6923223aa6e1ea9e6e20933b142c2e925a55ad8 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 05:56:06 +0800 Subject: [PATCH 29/38] test(ai-cache): expand, tighten, and reorganize cache plugin tests --- t/plugin/ai-cache.t | 445 ++++++++++++++++----------------- t/plugin/prometheus-ai-cache.t | 4 +- 2 files changed, 211 insertions(+), 238 deletions(-) diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index 1657fbec3d76..b8e1557c2cca 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -133,6 +133,7 @@ passed layers = { "semantic" }, redis_host = "127.0.0.1", }) + if not ok then ngx.say("failed: ", err) else @@ -153,15 +154,16 @@ failed: semantic layer requires semantic.embedding to be configured local ok, err = plugin.check_schema({ layers = { "invalid_layer" }, }) + if not ok then - ngx.say("failed") + ngx.say(err) else ngx.say("passed") end } } ---- response_body -failed +--- response_body eval +qr/.*property "layers" validation failed:.*matches none of the enum values.*/ @@ -182,14 +184,14 @@ failed }) if not ok then - ngx.say("failed") + ngx.say(err) else ngx.say("passed") end } } ---- response_body -failed +--- response_body eval +qr/.*property "provider" validation failed: matches none of the enum values.*/ @@ -211,18 +213,40 @@ failed }) if not ok then - ngx.say("failed") + ngx.say(err) else ngx.say("passed") end } } ---- response_body -failed +--- response_body eval +qr/.*property "similarity_threshold" validation failed: expected 1\.5 to be at most.*/ -=== TEST 7: set up route for L1 cache tests +=== TEST 7: layers empty array - should fail (minItems=1) +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = {}, + redis_host = "127.0.0.1", + }) + + if not ok then + ngx.say(err) + else + ngx.say("passed") + end + } + } +--- response_body eval +qr/.*property "layers" validation failed: expect array to have at least 1 items.*/ + + + +=== TEST 8: set up route for L1 cache tests --- config location /t { content_by_lua_block { @@ -230,7 +254,7 @@ failed local code, body = t('/apisix/admin/routes/1', ngx.HTTP_PUT, [[{ - "uri": "/chat", + "uri": "/exact", "plugins": { "ai-proxy": { "provider": "openai", @@ -264,9 +288,9 @@ passed -=== TEST 8: first request - cache MISS, upstream called +=== TEST 9: first request - cache MISS, upstream called --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the answer to life?"}]} --- more_headers Content-Type: application/json @@ -275,13 +299,13 @@ X-AI-Fixture: openai/chat-basic.json --- response_headers X-AI-Cache-Status: MISS --- response_body_like eval -qr/content/ +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ -=== TEST 9: second identical request - cache HIT-L1, no upstream call +=== TEST 10: second identical request - cache HIT-L1, no upstream call --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the answer to life?"}]} --- more_headers Content-Type: application/json @@ -289,16 +313,18 @@ X-AI-Fixture: 
openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: HIT-L1 +--- response_headers_like +X-AI-Cache-Age: \d+ --- response_body_like eval -qr/content/ +qr/"content":\s?"1 \+ 1 = 2\."/ --- error_log ai-cache: L1 hit for key -=== TEST 10: bypass header - BYPASS, upstream called, not cached +=== TEST 11: bypass header - BYPASS, upstream called, not cached --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the bypass question?"}]} --- more_headers Content-Type: application/json @@ -310,9 +336,9 @@ X-AI-Cache-Status: BYPASS -=== TEST 11: same prompt without bypass after bypass - still MISS (bypass did not cache) +=== TEST 12: same prompt without bypass after bypass - still MISS (bypass did not cache) --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the bypass question?"}]} --- more_headers Content-Type: application/json @@ -323,7 +349,7 @@ X-AI-Cache-Status: MISS -=== TEST 12: set up route with two bypass rules +=== TEST 13: set up route with two bypass rules --- config location /t { content_by_lua_block { @@ -331,7 +357,7 @@ X-AI-Cache-Status: MISS local code, body = t('/apisix/admin/routes/1', ngx.HTTP_PUT, [[{ - "uri": "/chat", + "uri": "/exact", "plugins": { "ai-proxy": { "provider": "openai", @@ -368,9 +394,9 @@ passed -=== TEST 13: first bypass rule matches - BYPASS +=== TEST 14: first bypass rule matches - BYPASS --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"multi-rule bypass test"}]} --- more_headers Content-Type: application/json @@ -382,9 +408,9 @@ X-AI-Cache-Status: BYPASS -=== TEST 14: second bypass rule matches - BYPASS +=== TEST 15: second bypass rule matches - BYPASS --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"multi-rule bypass test"}]} --- more_headers Content-Type: application/json @@ -396,7 +422,7 @@ X-AI-Cache-Status: BYPASS -=== TEST 15: set up route for 4xx test +=== TEST 16: set up route for upstream-status filter tests --- config location /t { content_by_lua_block { @@ -437,226 +463,190 @@ passed -=== TEST 16: 4xx from upstream - not cached +=== TEST 17: non-2xx upstream response - not cached (status code filter) --- request POST /error -{"messages":[{"role":"user","content":"trigger an error please"}]} +{"messages":[{"role":"user","content":"trigger a server error"}]} --- more_headers Content-Type: application/json X-AI-Fixture: openai/chat-basic.json -X-AI-Fixture-Status: 400 ---- error_code: 400 +X-AI-Fixture-Status: 500 +--- error_code: 500 --- response_headers X-AI-Cache-Status: MISS -=== TEST 17: same prompt after 4xx - still MISS (4xx was not cached) +=== TEST 18: same prompt after non-2xx - still MISS (was not cached) --- request POST /error -{"messages":[{"role":"user","content":"trigger an error please"}]} +{"messages":[{"role":"user","content":"trigger a server error"}]} --- more_headers Content-Type: application/json X-AI-Fixture: openai/chat-basic.json -X-AI-Fixture-Status: 400 ---- error_code: 400 +X-AI-Fixture-Status: 500 +--- error_code: 500 --- response_headers X-AI-Cache-Status: MISS -=== TEST 18: openai driver - parses embedding vector correctly ---- http_config -server { - listen 1990; - default_type 'application/json'; - - location /v1/embeddings { - content_by_lua_block { - local cjson = require("cjson.safe") - ngx.req.read_body() - local body = cjson.decode(ngx.req.get_body_data()) - - if ngx.req.get_headers()["Authorization"] ~= "Bearer test-key" then - ngx.status = 401 - 
ngx.say('{"error":"unauthorized"}') - return - end - - ngx.status = 200 - ngx.say(cjson.encode({ - data = { - { embedding = {0.1, 0.2, 0.3}, index = 0, object = "embedding" } - }, - model = body.model, - object = "list" - })) - } - } -} +=== TEST 19: set up route with very small max_cache_body_size --- config location /t { content_by_lua_block { - local http = require("resty.http") - local driver = require("apisix.plugins.ai-cache.embeddings.openai") - - local httpc = http.new() - local conf = { - endpoint = "http://127.0.0.1:1990/v1/embeddings", - api_key = "test-key", - model = "text-embedding-3-small", - } - - local embedding, status, err = driver.get_embeddings(conf, "hello world", httpc, false) - if not embedding then - ngx.say("error: ", err) - return - end + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/3', + ngx.HTTP_PUT, + [[{ + "uri": "/tiny", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "max_cache_body_size": 5, + "redis_host": "127.0.0.1" + } + } + }]] + ) - if #embedding ~= 3 then - ngx.say("wrong length: ", #embedding) - return + if code >= 300 then + ngx.status = code end - - ngx.say("ok: ", embedding[1], " ", embedding[2], " ", embedding[3]) + ngx.say(body) } } --- response_body -ok: 0.1 0.2 0.3 +passed -=== TEST 19: openai driver - 429 from API return nil with status ---- http_config -server { - listen 1990; - default_type 'application/json'; - - location /v1/embeddings { - content_by_lua_block { - ngx.status = 429 - ngx.say('{"error":{"message":"rate limit exceeded","type":"requests"}}') - } - } -} ---- config - location /t { - content_by_lua_block { - local http = require("resty.http") - local driver = require("apisix.plugins.ai-cache.embeddings.openai") - - local httpc = http.new() - local conf = { - endpoint = "http://127.0.0.1:1990/v1/embeddings", - api_key = "test-key", - } - - local embedding, status, err = driver.get_embeddings(conf, "hello", httpc, false) - if embedding then - ngx.say("unexpected success") - return - end - - ngx.say("status: ", status) - } - } ---- response_body -status: 429 +=== TEST 20: oversize response - MISS, log warns and skips cache write +--- request +POST /tiny +{"messages":[{"role":"user","content":"oversize body test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ +--- error_log +exceeds max_cache_body_size -=== TEST 20: azure_openai driver - parses embedding vector correctly ---- http_config -server { - listen 1990; - default_type 'application/json'; +=== TEST 21: same prompt after oversize - still MISS (was not cached) +--- request +POST /tiny +{"messages":[{"role":"user","content":"oversize body test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ +--- error_log +exceeds max_cache_body_size - location /embeddings { - content_by_lua_block { - local cjson = require("cjson.safe") - if ngx.req.get_headers()["api-key"] ~= "azure-test-key" then - ngx.status = 401 - 
ngx.say('{"error":"unauthorized"}') - return - end - ngx.status = 200 - ngx.say(cjson.encode({ - data = { - { embedding = {0.4, 0.5, 0.6}, index = 0, object = "embedding" } - }, - object = "list" - })) - } - } -} +=== TEST 22: set up route with custom cache header names --- config location /t { content_by_lua_block { - local http = require("resty.http") - local driver = require("apisix.plugins.ai-cache.embeddings.azure_openai") - - local httpc = http.new() - local conf = { - endpoint = "http://127.0.0.1:1990/embeddings", - api_key = "azure-test-key", - } - - local embedding, status, err = driver.get_embeddings(conf, "hello world", httpc, false) - if not embedding then - ngx.say("error: ", err) - return - end + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/4', + ngx.HTTP_PUT, + [[{ + "uri": "/custom-headers", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "headers": { + "cache_status": "X-Custom-Status", + "cache_age": "X-Custom-Age" + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) - ngx.say("ok: ", embedding[1], " ", embedding[2], " ", embedding[3]) + if code >= 300 then + ngx.status = code + end + ngx.say(body) } } --- response_body -ok: 0.4 0.5 0.6 +passed -=== TEST 21: openai driver - 500 from API returns nil with status ---- http_config -server { - listen 1990; - default_type 'application/json'; +=== TEST 23: MISS populates the cache and emits custom status header +--- request +POST /custom-headers +{"messages":[{"role":"user","content":"custom header test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-Custom-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ +--- wait: 1 - location /v1/embeddings { - content_by_lua_block { - ngx.status = 500 - ngx.say('{"error":{"message":"internal server error"}}') - } - } -} ---- config - location /t { - content_by_lua_block { - local http = require("resty.http") - local driver = require("apisix.plugins.ai-cache.embeddings.openai") - - local httpc = http.new() - local conf = { - endpoint = "http://127.0.0.1:1990/v1/embeddings", - api_key = "test-key", - } - - local embedding, status, err = driver.get_embeddings(conf, "hello", httpc, false) - if embedding then - ngx.say("unexpected success") - return - end - ngx.say("status: ", status) - } - } ---- response_body -status: 500 +=== TEST 24: HIT emits custom status and age headers (defaults not used) +--- request +POST /custom-headers +{"messages":[{"role":"user","content":"custom header test"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-Custom-Status: HIT-L1 +X-AI-Cache-Status: +X-AI-Cache-Age: +--- response_headers_like +X-Custom-Age: \d+ +--- response_body_like eval +qr/"content":\s?"1 \+ 1 = 2\."/ -=== TEST 22: clean up L2 state before semantic tests + +=== TEST 25: clean up Redis cache state before semantic tests --- config location /t { content_by_lua_block { @@ -665,7 +655,6 @@ status: 500 red:set_timeout(1000) assert(red:connect("127.0.0.1", 6379)) - red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD") local keys = red:keys("ai-cache:*") @@ -682,12 +671,12 @@ ok -=== TEST 23: set up route for L2 
semantic cache tests +=== TEST 26: set up route for L2 semantic cache tests --- config location /t { content_by_lua_block { local t = require("lib.test_admin").test - local code, body = t('/apisix/admin/routes/3', + local code, body = t('/apisix/admin/routes/5', ngx.HTTP_PUT, [[{ "uri": "/semantic", @@ -733,7 +722,7 @@ passed -=== TEST 24: L2 - first request, cache MISS, stored in L2 +=== TEST 27: L2 - first request, cache MISS, stored in L2 --- request POST /semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -743,10 +732,12 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ -=== TEST 25: L2 - different wording hits L2 (same vector from fixture) +=== TEST 28: L2 - different wording hits L2 (same vector from fixture) --- request POST /semantic {"messages":[{"role":"user","content":"Name the capital city of France"}]} @@ -756,55 +747,35 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: HIT-L2 +--- response_headers_like +X-AI-Cache-Similarity: \d+(\.\d+)? --- response_body_like eval -qr/content/ +qr/"content":\s?"1 \+ 1 = 2\."/ --- error_log ai-cache: L2 hit -=== TEST 26: L2 - original prompt now hits L1 (backfilled by the L2 hit) +=== TEST 29: L2 - paraphrase now hits L1 (backfilled by the previous L2 hit) --- request POST /semantic -{"messages":[{"role":"user","content":"What is the capital of France??"}]} +{"messages":[{"role":"user","content":"Name the capital city of France"}]} --- more_headers Content-Type: application/json X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: HIT-L1 +--- response_body_like eval +qr/"content":\s?"1 \+ 1 = 2\."/ --- error_log ai-cache: L1 hit for key -=== TEST 27: L2 degradation - search error results in MISS, not 500 ---- config - location /t { - content_by_lua_block { - local semantic = require("apisix.plugins.ai-cache.semantic") - local conf = { - redis_host = "127.0.0.1", - redis_port = 6379, - redis_timeout = 100, - } - - local text, sim, err = semantic.search(conf, "", {0.1, 0.2, 0.3}, 0.95) - if err then - ngx.say("degraded gracefully") - else - ngx.say("miss, no error") - end - } - } ---- response_body_like eval -qr/degraded gracefully|miss, no error/ - - - -=== TEST 28: streaming MISS - upstream called, response cached via log phase +=== TEST 30: streaming MISS - upstream called, response cached via log phase --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} --- more_headers Content-Type: application/json @@ -812,12 +783,14 @@ X-AI-Fixture: openai/chat-streaming.sse --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS +--- response_body_like eval +qr/data:.*"content":"Hello"/ -=== TEST 29: streaming HIT - Content-Type is text/event-stream, SSE body returned +=== TEST 31: streaming HIT - Content-Type is text/event-stream, SSE body returned --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} --- more_headers Content-Type: application/json @@ -826,14 +799,14 @@ Content-Type: application/json X-AI-Cache-Status: HIT-L1 Content-Type: text/event-stream --- response_body_like eval -qr/data:.*content/ +qr/data:.*"content":\s?"Hello!"/ --- wait: 1 -=== TEST 30: non-streaming HIT after streaming MISS - returns JSON +=== TEST 32: non-streaming HIT 
after streaming MISS - returns JSON --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"Stream me something cool"}]} --- more_headers Content-Type: application/json @@ -842,4 +815,4 @@ Content-Type: application/json X-AI-Cache-Status: HIT-L1 Content-Type: application/json --- response_body_like eval -qr/content/ +qr/"content":\s?"Hello!"/ diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t index 0863402ea521..3af1a6ae2491 100644 --- a/t/plugin/prometheus-ai-cache.t +++ b/t/plugin/prometheus-ai-cache.t @@ -323,7 +323,7 @@ qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1" -=== TEST 9: cleanup Redis L2 state before semantic tests +=== TEST 9: cleanup Redis cache state before semantic tests --- config location /t { content_by_lua_block { @@ -332,7 +332,7 @@ qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1" red:set_timeout(1000) assert(red:connect("127.0.0.1", 6379)) - red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") + red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD") local keys = red:keys("ai-cache:*") if type(keys) == "table" and #keys > 0 then From 439b9f03ce988dfa23a6d10a37d845bc87855d7c Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 06:43:35 +0800 Subject: [PATCH 30/38] fix(ai-cache): expose embedding ssl_verify, document model + timeout --- apisix/plugins/ai-cache.lua | 6 ++++-- apisix/plugins/ai-cache/embeddings/azure_openai.lua | 6 ++++++ apisix/plugins/ai-cache/embeddings/openai.lua | 6 ++++++ apisix/plugins/ai-cache/schema.lua | 8 ++++++++ docs/en/latest/plugins/ai-cache.md | 4 +++- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 69f24bec9bfb..cf914ab5d59b 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -154,7 +154,9 @@ function _M.access(conf, ctx) local httpc = http.new() local t0 = ngx_now() - local embedding, _, emb_err = emb_driver.get_embeddings(emb_conf, prompt_text, httpc, true) + local embedding, _, emb_err = emb_driver.get_embeddings( + emb_conf, prompt_text, httpc, emb_conf.ssl_verify + ) if not embedding then core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err) ctx.ai_cache_embedding_failed = true @@ -285,7 +287,7 @@ function _M.log(conf, ctx) ) local httpc = http.new() local emb, _, emb_err = emb_driver.get_embeddings( - emb_conf, prompt_text, httpc, true + emb_conf, prompt_text, httpc, emb_conf.ssl_verify ) if not emb then ngx.log(ngx.WARN, diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua index 928a803f948e..6f862ea78cc8 100644 --- a/apisix/plugins/ai-cache/embeddings/azure_openai.lua +++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua @@ -31,6 +31,8 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) return nil, HTTP_INTERNAL_SERVER_ERROR, err end + httpc:set_timeout(conf.timeout) + local res, err = httpc:request_uri(conf.endpoint, { method = "POST", headers = { @@ -39,6 +41,7 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) }, body = body, ssl_verify = ssl_verify, + keepalive = true, }) if not res or not res.body then @@ -62,6 +65,9 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) if type(embedding) ~= "table" then return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response" end + if #embedding == 0 then + return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty" + end return 
embedding, nil, nil end diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua index 0ca8c7cd61af..740b12d23f2d 100644 --- a/apisix/plugins/ai-cache/embeddings/openai.lua +++ b/apisix/plugins/ai-cache/embeddings/openai.lua @@ -34,6 +34,8 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) return nil, HTTP_INTERNAL_SERVER_ERROR, err end + httpc:set_timeout(conf.timeout) + local res, err = httpc:request_uri(conf.endpoint, { method = "POST", headers = { @@ -42,6 +44,7 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) }, body = body, ssl_verify = ssl_verify, + keepalive = true, }) if not res or not res.body then @@ -65,6 +68,9 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) if type(embedding) ~= "table" then return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response" end + if #embedding == 0 then + return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty" + end return embedding, nil, nil end diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 9bb417183424..57e4b9892045 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -29,6 +29,14 @@ local embedding_schema = { model = { type = "string" }, endpoint = { type = "string" }, api_key = { type = "string" }, + timeout = { + type = "integer", + minimum = 1, + maximum = 600000, + default = 5000, + description = "timeout in milliseconds", + }, + ssl_verify = { type = "boolean", default = true }, }, required = { "provider", "endpoint", "api_key" }, } diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 3c1789131134..f41559260c70 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -52,7 +52,9 @@ The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-m | `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. | | `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | Embedding API endpoint URL. | | `semantic.embedding.api_key` | string | True (if semantic enabled) | | | API key for the embedding provider. Stored encrypted. | -| `semantic.embedding.model` | string | False | | | Embedding model name. Uses provider default if omitted. | +| `semantic.embedding.model` | string | False | | | Embedding model name. Sent in the request body for `provider: openai`; ignored for `provider: azure_openai` (Azure infers the model from the deployment URL). Uses provider default if omitted. | +| `semantic.embedding.timeout` | integer | False | `5000` | [1, 600000] | HTTP request timeout in milliseconds for embedding API calls. | +| `semantic.embedding.ssl_verify` | boolean | False | `true` | | Whether to verify the embedding endpoint's TLS certificate. | | `cache_key.include_consumer` | boolean | False | `false` | | If `true`, partition the cache by consumer name. | | `cache_key.include_vars` | array[string] | False | `[]` | | Additional `ctx.var` names included in the cache key, for example `["$http_x_tenant_id"]`. | | `bypass_on` | array[object] | False | | | List of `{header, equals}` rules. If any matches, the request bypasses the cache. 
| From 61530f85968c1c013f1b2849534aeac1bceac591 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 06:51:51 +0800 Subject: [PATCH 31/38] docs(ai-cache): add field descriptions to schema; cap top_k at 100 --- apisix/plugins/ai-cache/schema.lua | 75 +++++++++++++++++++++++++----- docs/en/latest/plugins/ai-cache.md | 2 +- 2 files changed, 65 insertions(+), 12 deletions(-) diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 57e4b9892045..02587f7fb14c 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -25,18 +25,34 @@ local embedding_schema = { provider = { type = "string", enum = { "openai", "azure_openai" }, + description = "Embedding API provider.", + }, + model = { + type = "string", + description = "Embedding model name. Sent in the request body for " + .. "provider: openai; ignored for provider: azure_openai " + .. "(Azure infers the model from the deployment URL).", + }, + endpoint = { + type = "string", + description = "Embedding API endpoint URL.", + }, + api_key = { + type = "string", + description = "API key for the embedding provider.", }, - model = { type = "string" }, - endpoint = { type = "string" }, - api_key = { type = "string" }, timeout = { type = "integer", minimum = 1, maximum = 600000, default = 5000, - description = "timeout in milliseconds", + description = "HTTP request timeout in milliseconds for embedding API calls.", + }, + ssl_verify = { + type = "boolean", + default = true, + description = "Whether to verify the embedding endpoint's TLS certificate.", }, - ssl_verify = { type = "boolean", default = true }, }, required = { "provider", "endpoint", "api_key" }, } @@ -49,16 +65,21 @@ local semantic_schema = { minimum = 0, maximum = 1, default = 0.95, + description = "Minimum cosine similarity required for a semantic-layer hit.", }, top_k = { type = "integer", minimum = 1, + maximum = 100, default = 1, + description = "Number of nearest-neighbor candidates the index returns; " + .. "the first candidate above similarity_threshold is used.", }, ttl = { type = "integer", minimum = 1, default = 86400, + description = "Time-to-live in seconds for semantic-layer entries.", }, embedding = embedding_schema, }, @@ -72,6 +93,7 @@ local exact_schema = { type = "integer", minimum = 1, default = 3600, + description = "Time-to-live in seconds for exact-layer entries.", }, }, } @@ -80,8 +102,15 @@ local exact_schema = { local bypass_item_schema = { type = "object", properties = { - header = { type = "string" }, - equals = { type = "string" }, + header = { + type = "string", + description = "Request header name to inspect.", + }, + equals = { + type = "string", + description = "Value to match against the header. " + .. "If equal, the request bypasses the cache.", + }, }, required = { "header", "equals" }, } @@ -89,9 +118,22 @@ local bypass_item_schema = { local headers_schema = { type = "object", properties = { - cache_status = { type = "string", default = "X-AI-Cache-Status" }, - cache_similarity = { type = "string", default = "X-AI-Cache-Similarity" }, - cache_age = { type = "string", default = "X-AI-Cache-Age" }, + cache_status = { + type = "string", + default = "X-AI-Cache-Status", + description = "Response header name for cache status " + .. 
"(HIT-L1 / HIT-L2 / MISS / BYPASS).", + }, + cache_similarity = { + type = "string", + default = "X-AI-Cache-Similarity", + description = "Response header name for the similarity score of a semantic-layer hit.", + }, + cache_age = { + type = "string", + default = "X-AI-Cache-Age", + description = "Response header name for the age in seconds of an exact-layer hit.", + }, }, } @@ -104,15 +146,22 @@ _M.schema = { uniqueItems = true, minItems = 1, default = { "exact", "semantic" }, + description = "Cache layers to enable, queried in order.", }, cache_key = { type = "object", properties = { - include_consumer = {type = "boolean", default = false }, + include_consumer = { + type = "boolean", + default = false, + description = "If true, partition the cache by consumer name.", + }, include_vars = { type = "array", items = { type = "string" }, default = {}, + description = "Additional ctx.var names included in the cache key, " + .. "for example [\"$http_x_tenant_id\"].", }, }, }, @@ -121,12 +170,16 @@ _M.schema = { bypass_on = { type = "array", items = bypass_item_schema, + description = "List of {header, equals} rules. " + .. "If any matches, the request bypasses the cache.", }, headers = headers_schema, max_cache_body_size = { type = "integer", minimum = 1, default = 1048576, + description = "Maximum response size in bytes to write to cache. " + .. "Larger responses pass through but are not cached.", }, }, allOf = { redis_schema.schema.redis }, diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index f41559260c70..cec70ae3b675 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -47,7 +47,7 @@ The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-m | `layers` | array[string] | False | `["exact", "semantic"]` | `"exact"`, `"semantic"` | Cache layers to enable, queried in order. | | `exact.ttl` | integer | False | `3600` | ≥ 1 | Time-to-live in seconds for exact-layer entries. | | `semantic.similarity_threshold` | number | False | `0.95` | 0–1 | Minimum cosine similarity required for a semantic-layer hit. | -| `semantic.top_k` | integer | False | `1` | ≥ 1 | Number of nearest-neighbor candidates the index returns; the first candidate above `similarity_threshold` is used. | +| `semantic.top_k` | integer | False | `1` | [1, 100] | Number of nearest-neighbor candidates the index returns; the first candidate above `similarity_threshold` is used. | | `semantic.ttl` | integer | False | `86400` | ≥ 1 | Time-to-live in seconds for semantic-layer entries. | | `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. | | `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | Embedding API endpoint URL. 
| From a1de7514eba6f7fec7f524b0f85b4cf76784c320 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 07:05:00 +0800 Subject: [PATCH 32/38] refactor(ai-cache): tighten exact.lua redis pooling and prune dead code --- apisix/plugins/ai-cache.lua | 7 +------ apisix/plugins/ai-cache/exact.lua | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index cf914ab5d59b..026a2ee45b1f 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -113,12 +113,7 @@ function _M.access(conf, ctx) local prompt_text = table_concat(contents, " ") local scope_hash = exact.compute_scope_hash(conf, ctx) - local prompt_hash, hash_err = exact.compute_prompt_hash(prompt_text) - if not prompt_hash then - core.log.warn("ai-cache: failed to compute prompt hash: ", hash_err) - ctx.ai_cache_status = "MISS" - return - end + local prompt_hash = exact.compute_prompt_hash(prompt_text) local is_stream = body_tab.stream == true diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index d442cd89c467..52f3f98f6f56 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -37,7 +37,6 @@ local function sha256_hex(s) return to_hex(hash:final()) end -_M.sha256_hex = sha256_hex function _M.compute_scope_hash(conf, ctx) local cache_key = conf.cache_key @@ -73,7 +72,7 @@ end function _M.compute_prompt_hash(text) - return sha256_hex(text), nil + return sha256_hex(text) end @@ -84,13 +83,14 @@ function _M.get(conf, scope_hash, prompt_hash) end local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash - local res, err = red:get(key) - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) - - if err then - return nil, nil, err + local res, get_err = red:get(key) + if get_err then + red:close() + return nil, nil, get_err end + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + if res == ngx.null then return nil, nil, nil end @@ -117,16 +117,18 @@ function _M.set(conf, scope_hash, prompt_hash, text, ttl) }) if not entry then - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + red:close() return encode_err end - local ok, err = red:set(key, entry, "EX", ttl) - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) - + local ok, set_err = red:set(key, entry, "EX", ttl) if not ok then - return err + red:close() + return set_err end + + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return nil end From 7df710ead32152131bfb3206f29d68d7580b09d5 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 07:17:38 +0800 Subject: [PATCH 33/38] fix(ai-cache): tighten semantic.lua redis pooling and atomicize writes --- apisix/plugins/ai-cache/semantic.lua | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index 0e7519993cd1..6d84dbd28675 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ b/apisix/plugins/ai-cache/semantic.lua @@ -97,7 +97,7 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) local ok, init_err = ensure_index(red, #embedding_vec) if not ok then - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + red:close() return nil, nil, init_err end @@ -122,9 +122,9 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) "RETURN", "2", "response", "dist", 
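+        -- "dist" is the KNN distance reported by RediSearch; assuming the
+        -- index uses the COSINE metric (consistent with the 0-1
+        -- similarity_threshold), dist equals 1 - cosine_similarity, so a
+        -- threshold t corresponds to a distance cutoff of 1 - t.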
"DIALECT", "2" ) - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) if search_err then + red:close() -- index was dropped externally — invalidate so next call recreates if search_err:find("Unknown Index name", 1, true) then index_ready[#embedding_vec] = nil @@ -132,6 +132,8 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) return nil, nil, search_err end + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + if not res or res[1] == 0 then return nil, nil, nil end @@ -173,26 +175,35 @@ function _M.store(conf, scope_hash, embedding_vec, text, ttl) local ok, init_err = ensure_index(red, #embedding_vec) if not ok then - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + red:close() return init_err end local binary_vec = pack_vector(embedding_vec) local key = key_prefix(#embedding_vec) .. uuid.generate_v4() - local set_ok, set_err = red:hset(key, + -- HSET + EXPIRE wrapped in MULTI/EXEC so the entry is never written + -- without its TTL (which would orphan it in Redis forever). + local _, multi_err = red:multi() + if multi_err then + red:close() + return multi_err + end + + red:hset(key, "embedding", binary_vec, "response", text, "scope", scope_hash, "created_at", tostring(ngx_time()) ) + red:expire(key, ttl) - if not set_ok then - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) - return set_err + local results, exec_err = red:exec() + if not results then + red:close() + return exec_err end - red:expire(key, ttl) red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) return nil end From bf250acfc2472ec9eca64fc61636801fa64dc5f8 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 07:50:10 +0800 Subject: [PATCH 34/38] refactor(ai-cache): drop log-phase embedding re-fetch --- apisix/plugins/ai-cache.lua | 45 ++++++++++++++----------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 026a2ee45b1f..476bd785d85a 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -83,7 +83,6 @@ function _M.access(conf, ctx) local req_headers = ngx.req.get_headers() for _, rule in ipairs(conf.bypass_on) do if req_headers[rule.header] == rule.equals then - ctx.ai_cache_bypass = true ctx.ai_cache_status = "BYPASS" return end @@ -170,11 +169,14 @@ function _M.access(conf, ctx) elseif cached_text then core.log.info("ai-cache: L2 hit, similarity=", similarity) - local l1_ttl = (conf.exact and conf.exact.ttl) or 3600 - local l1_err = exact.set(conf, scope_hash, prompt_hash, cached_text, l1_ttl) - - if l1_err then - core.log.warn("ai-cache: L2->L1 backfill failed: ", l1_err) + if layer_enabled(conf, "exact") then + local l1_ttl = (conf.exact and conf.exact.ttl) or 3600 + local l1_err = exact.set( + conf, scope_hash, prompt_hash, cached_text, l1_ttl + ) + if l1_err then + core.log.warn("ai-cache: L2->L1 backfill failed: ", l1_err) + end end ctx.ai_cache_status = "HIT-L2" @@ -228,7 +230,7 @@ function _M.log(conf, ctx) return end - -- Early-MISS paths (body parse / protocol detect / hash failure) skip + -- Early-MISS paths (body parse / protocol detect / empty content) skip -- key computation, so bail out if cache key fields are absent. 
if not ctx.ai_cache_prompt_hash or not ctx.ai_cache_prompt_text then return @@ -258,9 +260,8 @@ function _M.log(conf, ctx) local scope_hash = ctx.ai_cache_scope_hash local prompt_hash = ctx.ai_cache_prompt_hash local embedding = ctx.ai_cache_embedding - local prompt_text = ctx.ai_cache_prompt_text - ngx.timer.at(0, function(premature) + local ok, timer_err = ngx.timer.at(0, function(premature) if premature then return end @@ -268,39 +269,27 @@ function _M.log(conf, ctx) if exact_enabled then local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl_exact) if err then - ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) + ngx.log(ngx.WARN, "ai-cache: failed to write L1 cache: ", err) end end if semantic_enabled then - local vec = embedding - - if not vec then - local emb_conf = conf.semantic.embedding - local emb_driver = require( - "apisix.plugins.ai-cache.embeddings." .. emb_conf.provider - ) - local httpc = http.new() - local emb, _, emb_err = emb_driver.get_embeddings( - emb_conf, prompt_text, httpc, emb_conf.ssl_verify - ) - if not emb then - ngx.log(ngx.WARN, - "ai-cache: failed to get embedding for L2 store: ", emb_err) - return - end - vec = emb + if not embedding then + return end local ttl_semantic = (conf.semantic and conf.semantic.ttl) or 86400 local store_err = semantic.store( - conf, scope_hash, vec, response_text, ttl_semantic + conf, scope_hash, embedding, response_text, ttl_semantic ) if store_err then ngx.log(ngx.WARN, "ai-cache: failed to write L2 cache: ", store_err) end end end) + if not ok then + core.log.warn("ai-cache: failed to schedule cache write: ", timer_err) + end end From 4796a1d1485bef1227c5365f2d881b56534d7a34 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 07:53:54 +0800 Subject: [PATCH 35/38] chore(ai-cache): fix lint --- apisix/plugins/ai-cache/exact.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index 52f3f98f6f56..e1a63f9a5f41 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -128,7 +128,7 @@ function _M.set(conf, scope_hash, prompt_hash, text, ttl) end red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) - + return nil end From 69e8c8e0b5f1ea1480e7cc80c229dfaa4180dfbb Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 08:00:47 +0800 Subject: [PATCH 36/38] docs(ai-cache): remove outdated caveats --- docs/en/latest/plugins/ai-cache.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index cec70ae3b675..46a446afd0bb 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -1073,16 +1073,8 @@ Two prompts that look semantically equivalent to a human can score below the con Lower the threshold to catch more paraphrases at the cost of occasionally serving a cached answer for a genuinely different question. Tune empirically against your traffic. -### Switching embedding models is safe - -The Plugin namespaces the L2 index and entries by embedding dimension (for example `1536` for `text-embedding-3-small`, `3072` for `text-embedding-3-large`), so changing the embedding model on a live route does not require any manual cleanup. A new index is created automatically for the new dimension; old entries from the previous model expire via the configured `semantic.ttl`. 
-

### `BYPASS` does not refresh the cache

A request with the bypass header reaches the upstream but its response is not written back. Use it to force a fresh upstream call without invalidating or replacing the existing cached entry.

The bypass header is not authenticated: any client that can set the configured header and value can bypass the cache. In production, gate access with an APISIX plugin such as `key-auth` (authentication) or `ip-restriction` (IP allow/deny lists), or restrict the header at your upstream WAF.
-
-### The semantic layer requires Redis Stack
-
-The `FT.CREATE` and `FT.SEARCH` commands used by the semantic layer come from the RediSearch module. Vanilla Redis will fail these commands and the layer will silently degrade to `MISS`. Use a Redis Stack image such as `redis/redis-stack:latest`.

From a8843cc10c1f55ddc5cf0f8eb2f07b34001047ee Mon Sep 17 00:00:00 2001
From: janiussyafiq
Date: Tue, 5 May 2026 08:06:18 +0800
Subject: [PATCH 37/38] chore(ai-cache): fix lint

---
 t/plugin/ai-cache.t | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t
index b8e1557c2cca..abc652328ac8 100644
--- a/t/plugin/ai-cache.t
+++ b/t/plugin/ai-cache.t
@@ -133,7 +133,7 @@ passed
                 layers = { "semantic" },
                 redis_host = "127.0.0.1",
             })
-            
+
             if not ok then
                 ngx.say("failed: ", err)
             else

From a1f012841b946660d07a524a3e33771ae441b0ba Mon Sep 17 00:00:00 2001
From: janiussyafiq
Date: Wed, 6 May 2026 00:09:33 +0800
Subject: [PATCH 38/38] docs(ai-cache): refresh examples and tighten cache-status callouts

---
 docs/en/latest/plugins/ai-cache.md | 158 +++++++++++++++++++++++++----
 1 file changed, 136 insertions(+), 22 deletions(-)

diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md
index 46a446afd0bb..523b727f836f 100644
--- a/docs/en/latest/plugins/ai-cache.md
+++ b/docs/en/latest/plugins/ai-cache.md
@@ -36,9 +36,9 @@ import TabItem from '@theme/TabItem';

 ## Description

-The `ai-cache` Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache instead of incurring another upstream call. It supports two cache layers: an exact-match layer (`exact`) keyed by a hash of the prompt, and a semantic layer (`semantic`) that compares prompt embeddings via Redis Stack vector search. Either layer can be enabled independently, and a hit on the semantic layer backfills the exact layer so subsequent identical prompts return immediately.
+The `ai-cache` Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache instead of incurring another upstream call. It supports two cache layers: an exact-match layer (`exact`) keyed by a hash of the prompt, and a semantic layer (`semantic`) that compares prompt embeddings via Redis Stack vector search. Either layer can be enabled independently, and when both are enabled, a hit on the semantic layer backfills the exact layer so subsequent identical prompts return immediately.

-The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-multi](./ai-proxy-multi.md) on the same Route. The semantic layer requires Redis Stack with the RediSearch module and an embedding provider (OpenAI or Azure OpenAI).
+The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-multi](./ai-proxy-multi.md) on the same Route. The semantic layer requires Redis Stack with the RediSearch module and an embedding provider (OpenAI or Azure OpenAI). PRs for additional embedding providers are welcome.
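+
+For orientation, a minimal plugin snippet that enables both layers (as it would appear under a Route's `plugins`) might look like the following sketch. The endpoint URL, model name, API key, and Redis address are illustrative placeholders; the attribute table and the examples below show complete, working configurations:
+
+```json
+{
+  "ai-cache": {
+    "layers": ["exact", "semantic"],
+    "exact": { "ttl": 3600 },
+    "semantic": {
+      "similarity_threshold": 0.85,
+      "ttl": 86400,
+      "embedding": {
+        "provider": "openai",
+        "endpoint": "https://api.openai.com/v1/embeddings",
+        "model": "text-embedding-3-small",
+        "api_key": "<your-embedding-api-key>"
+      }
+    },
+    "redis_host": "127.0.0.1"
+  }
+}
+```
+
+With such a configuration, the first occurrence of a prompt is a `MISS`, an identical retry is a `HIT-L1`, and a sufficiently close paraphrase is a `HIT-L2`.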
## Plugin Attributes @@ -283,7 +283,7 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ }' ``` -The first request reaches OpenAI and you should receive a response similar to the following: +The first request reaches OpenAI. Note the `X-AI-Cache-Status: MISS` header, indicating the prompt was not in cache and APISIX forwarded the request upstream: ```text HTTP/1.1 200 OK @@ -328,7 +328,7 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ }' ``` -The second request returns from cache without contacting OpenAI. The cached response is replayed from Redis, so the body is shorter and does not contain the original `created`, `model`, `usage`, or `system_fingerprint` fields: +The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header signals an exact-match hit and `X-AI-Cache-Age` reports the entry's age in seconds. The cached response is replayed from Redis, so the body is shorter and does not contain the original `created`, `model`, `usage`, or `system_fingerprint` fields: ```text HTTP/1.1 200 OK @@ -382,7 +382,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ "layers": ["exact", "semantic"], "exact": { "ttl": 3600 }, "semantic": { - "similarity_threshold": 0.92, + "similarity_threshold": 0.85, "ttl": 86400, "embedding": { "provider": "openai", @@ -424,7 +424,7 @@ services: exact: ttl: 3600 semantic: - similarity_threshold: 0.92 + similarity_threshold: 0.85 ttl: 86400 embedding: provider: openai @@ -470,7 +470,7 @@ spec: exact: ttl: 3600 semantic: - similarity_threshold: 0.92 + similarity_threshold: 0.85 ttl: 86400 embedding: provider: openai @@ -538,7 +538,7 @@ spec: exact: ttl: 3600 semantic: - similarity_threshold: 0.92 + similarity_threshold: 0.85 ttl: 86400 embedding: provider: openai @@ -567,12 +567,12 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ -H "Content-Type: application/json" \ -d '{ "messages": [ - { "role": "user", "content": "What is the capital of France?" } + { "role": "user", "content": "What is the capital city of China?" } ] }' ``` -The first request reaches OpenAI: +The first request reaches OpenAI with `X-AI-Cache-Status: MISS`: ```text HTTP/1.1 200 OK @@ -581,20 +581,20 @@ Server: APISIX/3.16.0 X-AI-Cache-Status: MISS { - "id": "chatcmpl-Da7Iqsqz9gc8Mkf07Hn4NCzAH5Ri1", + "id": "chatcmpl-DcCIDs6ZJisclo84FUk5fT2Ks5vzn", "object": "chat.completion", - "model": "gpt-4o-mini-2024-07-18", + "model": "gpt-4-0613", "choices": [ { "index": 0, "message": { "role": "assistant", - "content": "The capital of France is Paris." + "content": "The capital city of China is Beijing." }, "finish_reason": "stop" } ], - "usage": { "prompt_tokens": 14, "completion_tokens": 7, "total_tokens": 21 } + "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 } } ``` @@ -605,28 +605,28 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ -H "Content-Type: application/json" \ -d '{ "messages": [ - { "role": "user", "content": "capital of France what is?" } + { "role": "user", "content": "Capital city of China?" } ] }' ``` -The semantic layer matches the embedding (cosine similarity above the threshold) and returns the cached response without contacting OpenAI: +The semantic layer matches the embedding (cosine similarity above the threshold) and returns the cached response without contacting OpenAI. 
The `X-AI-Cache-Status: HIT-L2` header signals a semantic-layer hit and `X-AI-Cache-Similarity` reports the cosine similarity score: ```text HTTP/1.1 200 OK Content-Type: application/json Server: APISIX/3.16.0 X-AI-Cache-Status: HIT-L2 -X-AI-Cache-Similarity: 0.9720680713654 +X-AI-Cache-Similarity: 0.9065774679184 { - "id": "40b612a5-1424-4096-b7ec-8537a1ee6fd3", + "id": "a95488bb-4a51-491a-bd5b-2c1d0e5f8a9b", "object": "chat.completion", "choices": [ { "index": 0, "message": { - "content": "The capital of France is Paris.", + "content": "The capital city of China is Beijing.", "role": "assistant" }, "finish_reason": "stop" @@ -635,7 +635,7 @@ X-AI-Cache-Similarity: 0.9720680713654 } ``` -A semantic-layer hit also backfills the exact layer, so an immediate retry of the same paraphrase returns `X-AI-Cache-Status: HIT-L1`. +When the `exact` layer is also enabled (as in this example), a semantic-layer hit backfills it, so an immediate retry of the same paraphrase returns `X-AI-Cache-Status: HIT-L1`. ### Isolate Cache Entries Per Consumer or Tenant @@ -823,7 +823,121 @@ kubectl apply -f ai-cache-ic.yaml -Two requests with the same prompt but different `X-Tenant-Id` headers each receive `X-AI-Cache-Status: MISS`, because the cache key now includes the tenant identifier. +Send a first request as `tenant-a`: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Tenant-Id: tenant-a" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of Japan?" } + ] + }' +``` + +The first request reaches OpenAI with `X-AI-Cache-Status: MISS` and primes `tenant-a`'s cache scope: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-DcCRAzeSsimIOIeLQWsKtDxMLAAhu", + "object": "chat.completion", + "model": "gpt-4-0613", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital city of Japan is Tokyo." + }, + "finish_reason": "stop" + } + ], + "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 } +} +``` + +Repeat the same prompt as `tenant-a`: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Tenant-Id: tenant-a" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of Japan?" } + ] + }' +``` + +The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header confirms `tenant-a`'s entry was reused: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: HIT-L1 +X-AI-Cache-Age: 6 + +{ + "id": "6be4f7a2-83f1-4cdc-8654-cee0396bd4f3", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "content": "The capital city of Japan is Tokyo.", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` + +Send the same prompt as a different tenant, `tenant-b`: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Tenant-Id: tenant-b" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of Japan?" 
} + ] + }' +``` + +Even though the prompt is identical, the request reaches OpenAI with `X-AI-Cache-Status: MISS` because `tenant-b` has its own cache scope: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-DcCROH92JLWcgyhSpwEoutTvqnew5", + "object": "chat.completion", + "model": "gpt-4-0613", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital city of Japan is Tokyo." + }, + "finish_reason": "stop" + } + ], + "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 } +} +``` ### Bypass the Cache on a Header @@ -1020,7 +1134,7 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ }' ``` -The request reaches OpenAI even though a cached entry exists, and the response is not written back to cache. You can confirm the upstream was contacted because the response includes the original `created`, `model`, `usage`, and `system_fingerprint` fields: +The request reaches OpenAI even though a cached entry exists, and the response is not written back to cache. The `X-AI-Cache-Status: BYPASS` header confirms the cache was skipped, and the response includes the original `created`, `model`, `usage`, and `system_fingerprint` fields, verifying the upstream was contacted: ```text HTTP/1.1 200 OK