From 69fed98ffa732b9a151f690e27a88b5d3d3991a2 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 28 Apr 2026 00:09:39 +0800 Subject: [PATCH 01/38] feat(ai-cache): add plugin skeleton with schema definition --- apisix/plugins/ai-cache.lua | 62 ++++++++++ apisix/plugins/ai-cache/schema.lua | 138 +++++++++++++++++++++ conf/config.yaml.example | 1 + t/plugin/ai-cache.t | 192 +++++++++++++++++++++++++++++ 4 files changed, 393 insertions(+) create mode 100644 apisix/plugins/ai-cache.lua create mode 100644 apisix/plugins/ai-cache/schema.lua create mode 100644 t/plugin/ai-cache.t diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua new file mode 100644 index 000000000000..4c2d272b7c49 --- /dev/null +++ b/apisix/plugins/ai-cache.lua @@ -0,0 +1,62 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local core = require("apisix.core") +local schema = require("apisix.plugins.ai-cache.schema") + +local plugin_name = "ai-cache" + +local _M = { + version = 0.1, + priority = 1065, + name = plugin_name, + schema = schema.schema +} + +function _M.check_schema(conf) + local ok, err = core.schema.check(schema.schema, conf) + if not ok then + return false, err + end + + local layers = conf.layers or { "exact", "semantic" } + for _, layer in ipairs(layers) do + if layer == "semantic" and not (conf.semantic and conf.semantic.embedding) then + return false, "semantic layer requires semantic.embedding to be configured" + end + end + + return true +end + + +function _M.access(conf, ctx) + -- Phase 0 stub: will implement L1/L2 cache lookup in Phase 1 +end + + +function _M.body_filter(conf, ctx) + -- Phase 0 stub: will accumulate response chunks in Phase 1 +end + + +function _M.log(conf, ctx) + -- Phase 0 stub: will write to cache on 2xx in Phase 1 +end + + +return _M \ No newline at end of file diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua new file mode 100644 index 000000000000..60f5a4a901b4 --- /dev/null +++ b/apisix/plugins/ai-cache/schema.lua @@ -0,0 +1,138 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local _M = {} + +local embedding_schema = { + type = "object", + properties = { + provider = { + type = "string", + enum = { "openai", "azure_openai" }, + }, + model = { type = "string" }, + endpoint = { type = "string" }, + api_key = { type = "string" }, + }, + required = { "provider", "endpoint", "api_key" }, +} + +local semantic_schema = { + type = "object", + properties = { + similarity_threshold = { + type = "number", + minimum = 0, + maximum = 1, + default = 0.95, + }, + top_k = { + type = "integer", + minimum = 1, + default = 1, + }, + ttl = { + type = "integer", + minimum = 1, + default = 86400, + }, + embedding = embedding_schema, + }, + required = { "embedding" }, +} + +local exact_schema = { + type = "object", + properties = { + ttl = { + type = "integer", + minimum = 1, + default = 3600, + }, + }, +} + +local redis_schema = { + type = "object", + properties = { + host = { type = "string", default = "127.0.0.1" }, + port = { type = "integer", minimum = 1, maximum = 65535, default = 6379 }, + password = { type = "string", default = "" }, + database = { type = "integer", minimum = 0, default = 0 }, + timeout = { type = "integer", minimum = 1, default = 1000 }, + ssl = { type = "boolean", default = false }, + keepalive_timeout = { type = "integer", minimum = 1, default = 60000 }, + keepalive_pool = { type = "integer", minimum = 1, default = 5 }, + }, +} + +local bypass_item_schema = { + type = "object", + properties = { + header = { type = "string" }, + equals = { type = "string" }, + }, + required = { "header", "equals" }, +} + +local headers_schema = { + type = "object", + properties = { + cache_status = { type = "string", default = "X-AI-Cache-Status" }, + cache_similarity = { type = "string", default = "X-AI-Cache-Similarity" }, + cache_age = { type = "string", default = "X-AI-Cache-Age" }, + }, +} + +_M.schema = { + type = "object", + properties = { + layers = { + type = "array", + items = { type = "string", enum = { "exact", "semantic" } }, + uniqueItems = true, + default = { "exact", "semantic" }, + }, + cache_key = { + type = "object", + properties = { + include_consumer = {type = "boolean", default = false }, + include_vars = { + type = "array", + items = { type = "string" }, + default = {}, + }, + }, + }, + exact = exact_schema, + semantic = semantic_schema, + redis = redis_schema, + bypass_on = { + type = "array", + items = bypass_item_schema, + }, + headers = headers_schema, + max_cache_body_size = { + type = "integer", + minimum = 1, + default = 1048576, + }, + }, + encrypt_fields = { "semantic.embedding.api_key", "redis.password" }, +} + +return _M \ No newline at end of file diff --git a/conf/config.yaml.example b/conf/config.yaml.example index ae7155a86b06..901774540d70 100644 --- a/conf/config.yaml.example +++ b/conf/config.yaml.example @@ -514,6 +514,7 @@ plugins: # plugin list (sorted by priority) - ai-prompt-template # priority: 1071 - ai-prompt-decorator # priority: 1070 - ai-prompt-guard # priority: 1072 + - ai-cache # priority: 1065 - ai-rag # priority: 1060 - ai-aws-content-moderation # priority: 1050 - ai-proxy-multi # priority: 1041 diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t new file mode 100644 index 000000000000..fdeee9cd33d9 --- /dev/null +++ b/t/plugin/ai-cache.t @@ -0,0 +1,192 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +BEGIN { + $ENV{TEST_ENABLE_CONTROL_API_V1} = "0"; +} + +use t::APISIX 'no_plan'; + +log_level("info"); +repeat_each(1); +no_long_string(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } +}); + +run_tests(); + +__DATA__ + +=== TEST 1: valid config - exact layer only +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "exact" }, + exact = { ttl = 600 }, + redis = { + host = "127.0.0.1", + port = 6379, + } + }) + + if not ok then + ngx.say("failed") + else + ngx.say("passed") + end + } + } +--- response_body +passed + + + +=== TEST 2: valid config - both layers with semantic embedding +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "exact", "semantic" }, + exact = { ttl = 3600 }, + semantic = { + similarity_threshold = 0.95, + ttl = 86400, + embedding = { + provider = "openai", + endpoint = "https://api.openai.com/v1/embeddings", + api_key = "sk-test", + }, + }, + redis = { host = "127.0.0.1", port = 6379 }, + }) + + if not ok then + ngx.say(err) + else + ngx.say("passed") + end + } + } +--- response_body +passed + + + +=== TEST 3: semantic without embedding config - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "semantic" }, + }) + if not ok then + ngx.say("failed: ", err) + else + ngx.say("passed") + end + } + } +--- response_body +failed: semantic layer requires semantic.embedding to be configured + + + +=== TEST 4: invalid layer value - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "invalid_layer" }, + }) + if not ok then + ngx.say("failed") + else + ngx.say("passed") + end + } + } +--- response_body +failed + + + +=== TEST 5: unsupported embedding provider - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "semantic" }, + semantic = { + embedding = { + provider = "some-unknown-provider", + endpoint = "https://example.com/embeddings", + api_key = "key", + }, + }, + }) + + if not ok then + ngx.say("failed") + else + ngx.say("passed") + end + } + } +--- response_body +failed + + + +=== TEST 6: similarity_threshold out of range - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "semantic" }, + semantic = { + 
similarity_threshold = 1.5, + embedding = { + provider = "openai", + endpoint = "https://api.openai.com/v1/embeddings", + api_key = "sk-test", + }, + }, + }) + + if not ok then + ngx.say("failed") + else + ngx.say("passed") + end + } + } +--- response_body +failed From aea4028dedaadaa730fd112f54f7d4ad1cefd0fe Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 28 Apr 2026 05:19:34 +0800 Subject: [PATCH 02/38] feat(ai-cache): implement L1 exact cache with hash-based Redis lookup --- apisix/plugins/ai-cache.lua | 138 ++++++++++++++++++++- apisix/plugins/ai-cache/exact.lua | 138 +++++++++++++++++++++ apisix/plugins/ai-cache/schema.lua | 19 +-- t/plugin/ai-cache.t | 193 ++++++++++++++++++++++++++++- 4 files changed, 463 insertions(+), 25 deletions(-) create mode 100644 apisix/plugins/ai-cache/exact.lua diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 4c2d272b7c49..071763f27828 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -15,8 +15,13 @@ -- limitations under the License. -- -local core = require("apisix.core") -local schema = require("apisix.plugins.ai-cache.schema") +local core = require("apisix.core") +local schema = require("apisix.plugins.ai-cache.schema") +local exact = require("apisix.plugins.ai-cache.exact") + +local table_concat = table.concat +local ngx_time = ngx.time +local tostring = tostring local plugin_name = "ai-cache" @@ -45,17 +50,140 @@ end function _M.access(conf, ctx) - -- Phase 0 stub: will implement L1/L2 cache lookup in Phase 1 + -- Check bypass_on conditions + if conf.bypass_on then + local req_headers = ngx.req.get_headers() + for _, rule in ipairs(conf.bypass_on) do + if req_headers[rule.header] == rule.equals then + ctx.ai_cache_bypass = true + ctx.ai_cache_status = "BYPASS" + return + end + end + end + + -- Read and parse request body + local body_tab, err = core.request.get_json_request_body_table() + if not body_tab then + core.log.warn("ai-cache: failed to read request body: ", err or "unknown error") + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + return + end + + local messages = body_tab.messages + if not messages then + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + return + end + + -- Compute cache key components + local scope_hash = exact.compute_scope_hash(conf, ctx) + local prompt_hash, err = exact.compute_prompt_hash(messages) + if not prompt_hash then + core.log.warn("ai-cache: failed to compute prompt hash: ", err) + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + return + end + + -- L1 exact lookup + local layers = conf.layers or { "exact", "semantic" } + local exact_enabled = false + for _, l in ipairs(layers) do + if l == "exact" then + exact_enabled = true + break + end + end + + if exact_enabled then + local cached_body, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) + if lookup_err then + core.log.warn("ai-cache: L1 lookup error: ", lookup_err) + elseif cached_body then + core.log.info("ai-cache: L1 hit for key ", prompt_hash) + ctx.ai_cache_status = "HIT-L1" + ctx.ai_cache_written_at = written_at + return core.response.exit(200, cached_body) + end + end + + -- MISS - store context for body_filter and log phases + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + ctx.ai_cache_scope_hash = scope_hash + ctx.ai_cache_prompt_hash = prompt_hash +end + + +function _M.header_filter(conf, ctx) + if not ctx.ai_cache_status then + return + end + + local status_header = (conf.headers and conf.headers.cache_status) + or 
"X-AI-Cache-Status" + ngx.header[status_header] = ctx.ai_cache_status + + if ctx.ai_cache_status == "HIT-L1" and ctx.ai_cache_written_at then + local age_header = (conf.headers and conf.headers.cache_age) + or "X-AI-Cache-Age" + ngx.header[age_header] = tostring(ngx_time() - ctx.ai_cache_written_at) + end end function _M.body_filter(conf, ctx) - -- Phase 0 stub: will accumulate response chunks in Phase 1 + if not ctx.ai_cache_miss then + return + end + + local chunk = ngx.arg[1] + + if type(chunk) == "string" and chunk ~= "" then + if not ctx.ai_cache_body_chunks then + ctx.ai_cache_body_chunks = {} + end + local chunks = ctx.ai_cache_body_chunks + chunks[#chunks + 1] = chunk + end end function _M.log(conf, ctx) - -- Phase 0 stub: will write to cache on 2xx in Phase 1 + if not ctx.ai_cache_miss or ctx.ai_cache_bypass then + return + end + + local status = core.response.get_upstream_status(ctx) or ngx.status + if not status or status < 200 or status >= 300 then + return + end + + if not ctx.ai_cache_body_chunks then + return + end + + local body = table_concat(ctx.ai_cache_body_chunks) + local max_size = conf.max_cache_body_size or 1048576 + if #body > max_size then + core.log.warn("ai-cache: response body exceeds max_cache_body_size, skipping write") + return + end + + local ttl = (conf.exact and conf.exact.ttl) or 3600 + local scope_hash = ctx.ai_cache_scope_hash + local prompt_hash = ctx.ai_cache_prompt_hash + + ngx.timer.at(0, function(premature) + if premature then return end + local err = exact.set(conf, scope_hash, prompt_hash, body, ttl) + if err then + ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) + end + end) end diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua new file mode 100644 index 000000000000..56d9d98f16ca --- /dev/null +++ b/apisix/plugins/ai-cache/exact.lua @@ -0,0 +1,138 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +local core = require("apisix.core") +local redis = require("apisix.utils.redis") +local resty_sha256 = require("resty.sha256") +local to_hex = require("resty.string").to_hex + +local table_concat = table.concat +local table_sort = table.sort +local ngx_time = ngx.time +local tostring = tostring + +local KEY_PREFIX = "ai-cache:l1:" + +local _M = {} + + +local function sha256_hex(s) + local hash = resty_sha256:new() + hash:update(s) + return to_hex(hash:final()) +end + +_M.sha256_hex = sha256_hex + +function _M.compute_scope_hash(conf, ctx) + local cache_key = conf.cache_key + if not cache_key then + return "" + end + + local parts = {} + local n = 0 + + if cache_key.include_consumer then + n = n + 1 + parts[n] = ctx.consumer_name or "" + end + + if cache_key.include_vars then + for _, var_name in ipairs(cache_key.include_vars) do + local key = var_name + if key:sub(1, 1) == "$" then + key = key:sub(2) + end + n = n + 1 + parts[n] = tostring(ctx.var[key] or "") + end + end + + if n == 0 then + return "" + end + + table_sort(parts) + return sha256_hex(table_concat(parts, "|")) +end + + +function _M.compute_prompt_hash(messages) + local encoded, err = core.json.encode(messages) + if not encoded then + return nil, err + end + return sha256_hex(encoded), nil +end + + +function _M.get(conf, scope_hash, prompt_hash) + local red, err = redis.new(conf) + if not red then + return nil, nil, err + end + + local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash + local res, err = red:get(key) + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + + if err then + return nil, nil, err + end + + if res == ngx.null then + return nil, nil, nil + end + + local entry, decode_err = core.json.decode(res) + if not entry then + return nil, nil, "corrupt cache entry: " .. decode_err + end + + return entry.body, entry.written_at, nil +end + + +function _M.set(conf, scope_hash, prompt_hash, body, ttl) + local red, err = redis.new(conf) + if not red then + return err + end + + local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash + local entry, encode_err = core.json.encode({ + body = body, + written_at = ngx_time(), + }) + + if not entry then + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return encode_err + end + + local ok, err = red:set(key, entry, "EX", ttl) + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + + if not ok then + return err + end + return nil +end + + +return _M diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 60f5a4a901b4..76878d4c028b 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -15,6 +15,8 @@ -- limitations under the License. 
-- +local redis_schema = require("apisix.utils.redis-schema") + local _M = {} local embedding_schema = { @@ -66,19 +68,6 @@ local exact_schema = { }, } -local redis_schema = { - type = "object", - properties = { - host = { type = "string", default = "127.0.0.1" }, - port = { type = "integer", minimum = 1, maximum = 65535, default = 6379 }, - password = { type = "string", default = "" }, - database = { type = "integer", minimum = 0, default = 0 }, - timeout = { type = "integer", minimum = 1, default = 1000 }, - ssl = { type = "boolean", default = false }, - keepalive_timeout = { type = "integer", minimum = 1, default = 60000 }, - keepalive_pool = { type = "integer", minimum = 1, default = 5 }, - }, -} local bypass_item_schema = { type = "object", @@ -120,7 +109,6 @@ _M.schema = { }, exact = exact_schema, semantic = semantic_schema, - redis = redis_schema, bypass_on = { type = "array", items = bypass_item_schema, @@ -132,7 +120,8 @@ _M.schema = { default = 1048576, }, }, - encrypt_fields = { "semantic.embedding.api_key", "redis.password" }, + allOf = { redis_schema.schema.redis }, + encrypt_fields = { "semantic.embedding.api_key", "redis_password" }, } return _M \ No newline at end of file diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index fdeee9cd33d9..56fa2ee9e3b0 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -24,6 +24,7 @@ use t::APISIX 'no_plan'; log_level("info"); repeat_each(1); no_long_string(); +no_shuffle(); no_root_location(); add_block_preprocessor(sub { @@ -32,6 +33,36 @@ add_block_preprocessor(sub { if (!defined $block->request) { $block->set_value("request", "GET /t"); } + + if (!$block->error_log && !$block->no_error_log) { + $block->set_value("no_error_log", "[error]\n[alert]"); + } + + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + server_name llm; + listen 1990; + default_type 'application/json'; + + location / { + content_by_lua_block { + ngx.status = 200 + ngx.header["Content-Type"] = "application/json" + ngx.say('{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]}') + } + } + + location /error { + content_by_lua_block { + ngx.status = 400 + ngx.header["Content-Type"] = "application/json" + ngx.say('{"error":{"message":"bad request","type":"invalid_request_error"}}') + } + } +} +_EOC_ + } }); run_tests(); @@ -46,10 +77,8 @@ __DATA__ local ok, err = plugin.check_schema({ layers = { "exact" }, exact = { ttl = 600 }, - redis = { - host = "127.0.0.1", - port = 6379, - } + redis_host = "127.0.0.1", + redis_port = 6379, }) if not ok then @@ -81,7 +110,8 @@ passed api_key = "sk-test", }, }, - redis = { host = "127.0.0.1", port = 6379 }, + redis_host = "127.0.0.1", + redis_port = 6379, }) if not ok then @@ -103,6 +133,7 @@ passed local plugin = require("apisix.plugins.ai-cache") local ok, err = plugin.check_schema({ layers = { "semantic" }, + redis_host = "127.0.0.1", }) if not ok then ngx.say("failed: ", err) @@ -190,3 +221,155 @@ failed } --- response_body failed + + + +=== TEST 7: set up route for L1 cache tests +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/chat", + "plugins": { + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1", + "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}] + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1990": 1 + } + } + }]] + ) + 
+ if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 8: first request - cache MISS, upstream called +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the answer to life?"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body +{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]} + + + +=== TEST 9: second identical request - cache HIT-L1, no upstream call +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the answer to life?"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- response_body +{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]} +--- error_log +ai-cache: L1 hit for key + + + +=== TEST 10: bypass header - BYPASS, upstream called, not cached +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the bypass question?"}]} +--- more_headers +Content-Type: application/json +X-Cache-Bypass: 1 +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 11: same prompt without bypass after bypass - still MISS (bypass did not cache) +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the bypass question?"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 12: set up route for 4xx test +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/2', + ngx.HTTP_PUT, + [[{ + "uri": "/error", + "plugins": { + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1" + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1990": 1 + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 13: 4xx from upstream - not cached +--- request +POST /error +{"messages":[{"role":"user","content":"trigger an error please"}]} +--- more_headers +Content-Type: application/json +--- error_code: 400 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 14: same prompt after 4xx - still MISS (4xx was not cached) +--- request +POST /error +{"messages":[{"role":"user","content":"trigger an error please"}]} +--- more_headers +Content-Type: application/json +--- error_code: 400 +--- response_headers +X-AI-Cache-Status: MISS From f323e04277b150907650e817f9761d3e13ec0cd9 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 28 Apr 2026 06:12:40 +0800 Subject: [PATCH 03/38] feat(ai-cache): add embedding drivers for OpenAI and Azure OpenAI --- apisix/plugins/ai-cache.lua | 3 +- .../ai-cache/embeddings/azure_openai.lua | 68 +++++++ apisix/plugins/ai-cache/embeddings/openai.lua | 71 +++++++ t/plugin/ai-cache.t | 191 ++++++++++++++++++ 4 files changed, 332 insertions(+), 1 deletion(-) create mode 100644 apisix/plugins/ai-cache/embeddings/azure_openai.lua create mode 100644 apisix/plugins/ai-cache/embeddings/openai.lua diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 071763f27828..d4e13ac84146 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -18,7 +18,6 @@ local core = require("apisix.core") local schema = require("apisix.plugins.ai-cache.schema") 
local exact = require("apisix.plugins.ai-cache.exact") - local table_concat = table.concat local ngx_time = ngx.time local tostring = tostring @@ -45,6 +44,8 @@ function _M.check_schema(conf) end end + core.utils.check_https({ "semantic.embedding.endpoint" }, conf, plugin_name) + return true end diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua new file mode 100644 index 000000000000..1ce13ceefeab --- /dev/null +++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua @@ -0,0 +1,68 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local core = require("apisix.core") + +local HTTP_OK = ngx.HTTP_OK +local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR + +local _M = {} + + +function _M.get_embeddings(conf, text, httpc, ssl_verify) + local body, err = core.json.encode({ input = text }) + if not body then + return nil, HTTP_INTERNAL_SERVER_ERROR, err + end + + local res, err = httpc:request_uri(conf.endpoint, { + method = "POST", + headers = { + ["Content-Type"] = "application/json", + ["api-key"] = conf.api_key, + }, + body = body, + ssl_verify = ssl_verify, + }) + + if not res or not res.body then + return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API" + end + + if res.status ~= HTTP_OK then + return nil, res.status, res.body + end + + local res_tab, err = core.json.decode(res.body) + if not res_tab then + return nil, HTTP_INTERNAL_SERVER_ERROR, err + end + + if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then + return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body + end + + local embedding = res_tab.data[1].embedding + if type(embedding) ~= "table" then + return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response" + end + + return embedding, nil, nil +end + + +return _M diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua new file mode 100644 index 000000000000..60f65d6a9777 --- /dev/null +++ b/apisix/plugins/ai-cache/embeddings/openai.lua @@ -0,0 +1,71 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. 
You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+
+local HTTP_OK = ngx.HTTP_OK
+local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR
+
+local _M = {}
+
+
+function _M.get_embeddings(conf, text, httpc, ssl_verify)
+    local body, err = core.json.encode({
+        input = text,
+        model = conf.model or "text-embedding-3-small",
+    })
+    if not body then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, err
+    end
+
+    local res, err = httpc:request_uri(conf.endpoint, {
+        method = "POST",
+        headers = {
+            ["Content-Type"] = "application/json",
+            ["Authorization"] = "Bearer " .. conf.api_key,
+        },
+        body = body,
+        ssl_verify = ssl_verify,
+    })
+
+    if not res or not res.body then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API"
+    end
+
+    if res.status ~= HTTP_OK then
+        return nil, res.status, res.body
+    end
+
+    local res_tab, err = core.json.decode(res.body)
+    if not res_tab then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, err
+    end
+
+    if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body
+    end
+
+    local embedding = res_tab.data[1].embedding
+    if type(embedding) ~= "table" then
+        return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response"
+    end
+
+    return embedding, nil, nil
+end
+
+
+return _M
diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t
index 56fa2ee9e3b0..ae3a7e8ba3a3 100644
--- a/t/plugin/ai-cache.t
+++ b/t/plugin/ai-cache.t
@@ -373,3 +373,194 @@ Content-Type: application/json
 --- error_code: 400
 --- response_headers
 X-AI-Cache-Status: MISS
+
+
+
+=== TEST 15: openai driver - parses embedding vector correctly
+--- http_config
+server {
+    listen 1991;
+    default_type 'application/json';
+
+    location /v1/embeddings {
+        content_by_lua_block {
+            local cjson = require("cjson.safe")
+            ngx.req.read_body()
+            local body = cjson.decode(ngx.req.get_body_data())
+
+            if ngx.req.get_headers()["Authorization"] ~= "Bearer test-key" then
+                ngx.status = 401
+                ngx.say('{"error":"unauthorized"}')
+                return
+            end
+
+            ngx.status = 200
+            ngx.say(cjson.encode({
+                data = {
+                    { embedding = {0.1, 0.2, 0.3}, index = 0, object = "embedding" }
+                },
+                model = body.model,
+                object = "list"
+            }))
+        }
+    }
+}
+--- config
+    location /t {
+        content_by_lua_block {
+            local http = require("resty.http")
+            local driver = require("apisix.plugins.ai-cache.embeddings.openai")
+
+            local httpc = http.new()
+            local conf = {
+                endpoint = "http://127.0.0.1:1991/v1/embeddings",
+                api_key = "test-key",
+                model = "text-embedding-3-small",
+            }
+
+            local embedding, status, err = driver.get_embeddings(conf, "hello world", httpc, false)
+            if not embedding then
+                ngx.say("error: ", err)
+                return
+            end
+
+            if #embedding ~= 3 then
+                ngx.say("wrong length: ", #embedding)
+                return
+            end
+
+            ngx.say("ok: ", embedding[1], " ", embedding[2], " ", embedding[3])
+        }
+    }
+--- response_body
+ok: 0.1 0.2 0.3
+
+
+
+=== TEST 16: openai driver - 429 from API returns nil with status
+--- http_config
+server {
+    listen 1991;
+    default_type 'application/json';
+
+    location /v1/embeddings {
+        
content_by_lua_block { + ngx.status = 429 + ngx.say('{"error":{"message":"rate limit exceeded","type":"requests"}}') + } + } +} +--- config + location /t { + content_by_lua_block { + local http = require("resty.http") + local driver = require("apisix.plugins.ai-cache.embeddings.openai") + + local httpc = http.new() + local conf = { + endpoint = "http://127.0.0.1:1991/v1/embeddings", + api_key = "test-key", + } + + local embedding, status, err = driver.get_embeddings(conf, "hello", httpc, false) + if embedding then + ngx.say("unexpected success") + return + end + + ngx.say("status: ", status) + } + } +--- response_body +status: 429 + + + +=== TEST 17: azure_openai driver - parses embedding vector correctly +--- http_config +server { + listen 1991; + default_type 'application/json'; + + location /embeddings { + content_by_lua_block { + local cjson = require("cjson.safe") + + if ngx.req.get_headers()["api-key"] ~= "azure-test-key" then + ngx.status = 401 + ngx.say('{"error":"unauthorized"}') + return + end + + ngx.status = 200 + ngx.say(cjson.encode({ + data = { + { embedding = {0.4, 0.5, 0.6}, index = 0, object = "embedding" } + }, + object = "list" + })) + } + } +} +--- config + location /t { + content_by_lua_block { + local http = require("resty.http") + local driver = require("apisix.plugins.ai-cache.embeddings.azure_openai") + + local httpc = http.new() + local conf = { + endpoint = "http://127.0.0.1:1991/embeddings", + api_key = "azure-test-key", + } + + local embedding, status, err = driver.get_embeddings(conf, "hello world", httpc, false) + if not embedding then + ngx.say("error: ", err) + return + end + + ngx.say("ok: ", embedding[1], " ", embedding[2], " ", embedding[3]) + } + } +--- response_body +ok: 0.4 0.5 0.6 + + + +=== TEST 18: openai driver - 500 from API returns nil with status +--- http_config +server { + listen 1991; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + ngx.status = 500 + ngx.say('{"error":{"message":"internal server error"}}') + } + } +} +--- config + location /t { + content_by_lua_block { + local http = require("resty.http") + local driver = require("apisix.plugins.ai-cache.embeddings.openai") + + local httpc = http.new() + local conf = { + endpoint = "http://127.0.0.1:1991/v1/embeddings", + api_key = "test-key", + } + + local embedding, status, err = driver.get_embeddings(conf, "hello", httpc, false) + if embedding then + ngx.say("unexpected success") + return + end + + ngx.say("status: ", status) + } + } +--- response_body +status: 500 From eafce29c93153ac10560fcb78d51e5458147f94e Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 28 Apr 2026 06:19:56 +0800 Subject: [PATCH 04/38] chore: fix lint errors --- apisix/plugins/ai-cache.lua | 18 +++++++++--------- .../ai-cache/embeddings/azure_openai.lua | 18 +++++++++--------- apisix/plugins/ai-cache/embeddings/openai.lua | 18 +++++++++--------- apisix/plugins/ai-cache/exact.lua | 18 +++++++++--------- apisix/plugins/ai-cache/schema.lua | 18 +++++++++--------- 5 files changed, 45 insertions(+), 45 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index d4e13ac84146..9cb4f7b53243 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. 
See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local core = require("apisix.core") diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua index 1ce13ceefeab..a52c5e265497 100644 --- a/apisix/plugins/ai-cache/embeddings/azure_openai.lua +++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local core = require("apisix.core") diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua index 60f65d6a9777..ff50f2bbea27 100644 --- a/apisix/plugins/ai-cache/embeddings/openai.lua +++ b/apisix/plugins/ai-cache/embeddings/openai.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local core = require("apisix.core") diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index 56d9d98f16ca..5b42ebcb413e 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local core = require("apisix.core") diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 76878d4c028b..444858cc9067 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -1,18 +1,18 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with -- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 +-- The ASF licenses this file to You under the Apache License, Version 2.0 -- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 -- --- http://www.apache.org/licenses/LICENSE-2.0 --- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-- See the License for the specific language governing permissions and --- limitations under the License. +-- limitations under the License. -- local redis_schema = require("apisix.utils.redis-schema") From 09e46927345a067e1cc6e6776016c9d8d9446932 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Wed, 29 Apr 2026 00:13:42 +0800 Subject: [PATCH 05/38] refactor(ai-cache): use ai-protocols for protocol-agnostic caching --- apisix/plugins/ai-cache.lua | 86 ++++++++++++++----------------- apisix/plugins/ai-cache/exact.lua | 14 ++--- t/plugin/ai-cache.t | 75 ++++++++++++--------------- 3 files changed, 78 insertions(+), 97 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 9cb4f7b53243..5c5e1cb53ff8 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -18,9 +18,10 @@ local core = require("apisix.core") local schema = require("apisix.plugins.ai-cache.schema") local exact = require("apisix.plugins.ai-cache.exact") -local table_concat = table.concat -local ngx_time = ngx.time -local tostring = tostring +local protocols = require("apisix.plugins.ai-protocols") +local ngx_time = ngx.time +local tostring = tostring +local table_concat = table.concat local plugin_name = "ai-cache" @@ -63,7 +64,6 @@ function _M.access(conf, ctx) end end - -- Read and parse request body local body_tab, err = core.request.get_json_request_body_table() if not body_tab then core.log.warn("ai-cache: failed to read request body: ", err or "unknown error") @@ -72,24 +72,32 @@ function _M.access(conf, ctx) return end - local messages = body_tab.messages - if not messages then + local protocol_name = protocols.detect(body_tab, ctx) + if not protocol_name then + core.log.warn("ai-cache: could not detect AI protocol, skipping cache") + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" + return + end + + local proto = protocols.get(protocol_name) + local contents = proto.extract_request_content(body_tab) + if not contents or #contents == 0 then ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end - -- Compute cache key components + local prompt_text = table_concat(contents, " ") local scope_hash = exact.compute_scope_hash(conf, ctx) - local prompt_hash, err = exact.compute_prompt_hash(messages) + local prompt_hash, hash_err = exact.compute_prompt_hash(prompt_text) if not prompt_hash then - core.log.warn("ai-cache: failed to compute prompt hash: ", err) + core.log.warn("ai-cache: failed to compute prompt hash: ", hash_err) ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end - -- L1 exact lookup local layers = conf.layers or { "exact", "semantic" } local exact_enabled = false for _, l in ipairs(layers) do @@ -100,22 +108,26 @@ function _M.access(conf, ctx) end if exact_enabled then - local cached_body, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) + local cached_text, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) if lookup_err then core.log.warn("ai-cache: L1 lookup error: ", lookup_err) - elseif cached_body then + elseif cached_text then core.log.info("ai-cache: L1 hit for key ", prompt_hash) - ctx.ai_cache_status = "HIT-L1" + ctx.ai_cache_status = "HIT-L1" ctx.ai_cache_written_at = written_at - return core.response.exit(200, cached_body) + local is_stream = body_tab.stream == true + return core.response.exit(200, proto.build_deny_response({ + stream = is_stream, + text = cached_text, + })) end end - -- MISS - store context for body_filter and log phases - ctx.ai_cache_miss = true - 
ctx.ai_cache_status = "MISS" + ctx.ai_cache_miss = true + ctx.ai_cache_status = "MISS" ctx.ai_cache_scope_hash = scope_hash ctx.ai_cache_prompt_hash = prompt_hash + ctx.ai_cache_prompt_text = prompt_text end @@ -136,23 +148,6 @@ function _M.header_filter(conf, ctx) end -function _M.body_filter(conf, ctx) - if not ctx.ai_cache_miss then - return - end - - local chunk = ngx.arg[1] - - if type(chunk) == "string" and chunk ~= "" then - if not ctx.ai_cache_body_chunks then - ctx.ai_cache_body_chunks = {} - end - local chunks = ctx.ai_cache_body_chunks - chunks[#chunks + 1] = chunk - end -end - - function _M.log(conf, ctx) if not ctx.ai_cache_miss or ctx.ai_cache_bypass then return @@ -163,24 +158,21 @@ function _M.log(conf, ctx) return end - if not ctx.ai_cache_body_chunks then + local response_text = ctx.var.llm_response_text + if not response_text or response_text == "" then return end - local body = table_concat(ctx.ai_cache_body_chunks) - local max_size = conf.max_cache_body_size or 1048576 - if #body > max_size then - core.log.warn("ai-cache: response body exceeds max_cache_body_size, skipping write") - return - end - - local ttl = (conf.exact and conf.exact.ttl) or 3600 - local scope_hash = ctx.ai_cache_scope_hash - local prompt_hash = ctx.ai_cache_prompt_hash + local ttl = (conf.exact and conf.exact.ttl) or 3600 + local scope_hash = ctx.ai_cache_scope_hash + local prompt_hash = ctx.ai_cache_prompt_hash ngx.timer.at(0, function(premature) - if premature then return end - local err = exact.set(conf, scope_hash, prompt_hash, body, ttl) + if premature then + return + end + + local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl) if err then ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) end diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index 5b42ebcb413e..4b4d36b157a6 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -72,12 +72,8 @@ function _M.compute_scope_hash(conf, ctx) end -function _M.compute_prompt_hash(messages) - local encoded, err = core.json.encode(messages) - if not encoded then - return nil, err - end - return sha256_hex(encoded), nil +function _M.compute_prompt_hash(text) + return sha256_hex(text), nil end @@ -104,11 +100,11 @@ function _M.get(conf, scope_hash, prompt_hash) return nil, nil, "corrupt cache entry: " .. decode_err end - return entry.body, entry.written_at, nil + return entry.text, entry.written_at, nil end -function _M.set(conf, scope_hash, prompt_hash, body, ttl) +function _M.set(conf, scope_hash, prompt_hash, text, ttl) local red, err = redis.new(conf) if not red then return err @@ -116,7 +112,7 @@ function _M.set(conf, scope_hash, prompt_hash, body, ttl) local key = KEY_PREFIX .. scope_hash .. ":" .. 
prompt_hash local entry, encode_err = core.json.encode({ - body = body, + text = text, written_at = ngx_time(), }) diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index ae3a7e8ba3a3..a375d50bd9ca 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -38,31 +38,6 @@ add_block_preprocessor(sub { $block->set_value("no_error_log", "[error]\n[alert]"); } - if (!defined $block->http_config) { - $block->set_value("http_config", <<_EOC_); -server { - server_name llm; - listen 1990; - default_type 'application/json'; - - location / { - content_by_lua_block { - ngx.status = 200 - ngx.header["Content-Type"] = "application/json" - ngx.say('{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]}') - } - } - - location /error { - content_by_lua_block { - ngx.status = 400 - ngx.header["Content-Type"] = "application/json" - ngx.say('{"error":{"message":"bad request","type":"invalid_request_error"}}') - } - } -} -_EOC_ - } }); run_tests(); @@ -234,18 +209,23 @@ failed [[{ "uri": "/chat", "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, "ai-cache": { "layers": ["exact"], "exact": { "ttl": 60 }, "redis_host": "127.0.0.1", "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}] } - }, - "upstream": { - "type": "roundrobin", - "nodes": { - "127.0.0.1:1990": 1 - } } }]] ) @@ -267,11 +247,12 @@ POST /chat {"messages":[{"role":"user","content":"What is the answer to life?"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS ---- response_body -{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]} +--- response_body_like eval +qr/content/ @@ -281,11 +262,12 @@ POST /chat {"messages":[{"role":"user","content":"What is the answer to life?"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: HIT-L1 ---- response_body -{"id":"chatcmpl-test","choices":[{"message":{"role":"assistant","content":"The answer is 42"}}]} +--- response_body_like eval +qr/content/ --- error_log ai-cache: L1 hit for key @@ -297,6 +279,7 @@ POST /chat {"messages":[{"role":"user","content":"What is the bypass question?"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json X-Cache-Bypass: 1 --- error_code: 200 --- response_headers @@ -310,6 +293,7 @@ POST /chat {"messages":[{"role":"user","content":"What is the bypass question?"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS @@ -326,17 +310,22 @@ X-AI-Cache-Status: MISS [[{ "uri": "/error", "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, "ai-cache": { "layers": ["exact"], "exact": { "ttl": 60 }, "redis_host": "127.0.0.1" } - }, - "upstream": { - "type": "roundrobin", - "nodes": { - "127.0.0.1:1990": 1 - } } }]] ) @@ -358,6 +347,8 @@ POST /error {"messages":[{"role":"user","content":"trigger an error please"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-AI-Fixture-Status: 400 --- error_code: 400 --- 
response_headers X-AI-Cache-Status: MISS @@ -370,6 +361,8 @@ POST /error {"messages":[{"role":"user","content":"trigger an error please"}]} --- more_headers Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-AI-Fixture-Status: 400 --- error_code: 400 --- response_headers X-AI-Cache-Status: MISS From a899f6ace764b922d54c882d18a6821b300b72eb Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Wed, 29 Apr 2026 09:18:10 +0800 Subject: [PATCH 06/38] feat(ai-cache): implement L2 semantic cache with Redis Stack KNN search --- apisix/plugins/ai-cache.lua | 122 +++++++++++++++--- apisix/plugins/ai-cache/semantic.lua | 173 +++++++++++++++++++++++++ t/plugin/ai-cache.t | 184 +++++++++++++++++++++++++-- 3 files changed, 452 insertions(+), 27 deletions(-) create mode 100644 apisix/plugins/ai-cache/semantic.lua diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 5c5e1cb53ff8..216291692ca9 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -18,7 +18,9 @@ local core = require("apisix.core") local schema = require("apisix.plugins.ai-cache.schema") local exact = require("apisix.plugins.ai-cache.exact") +local semantic = require("apisix.plugins.ai-cache.semantic") local protocols = require("apisix.plugins.ai-protocols") +local http = require("resty.http") local ngx_time = ngx.time local tostring = tostring local table_concat = table.concat @@ -32,15 +34,24 @@ local _M = { schema = schema.schema } + +local function layer_enabled(conf, name) + local layers = conf.layers or { "exact", "semantic" } + for _, l in ipairs(layers) do + if l == name then return true end + end + return false +end + + function _M.check_schema(conf) local ok, err = core.schema.check(schema.schema, conf) if not ok then return false, err end - local layers = conf.layers or { "exact", "semantic" } - for _, layer in ipairs(layers) do - if layer == "semantic" and not (conf.semantic and conf.semantic.embedding) then + if layer_enabled(conf, "semantic") then + if not (conf.semantic and conf.semantic.embedding) then return false, "semantic layer requires semantic.embedding to be configured" end end @@ -98,21 +109,13 @@ function _M.access(conf, ctx) return end - local layers = conf.layers or { "exact", "semantic" } - local exact_enabled = false - for _, l in ipairs(layers) do - if l == "exact" then - exact_enabled = true - break - end - end - - if exact_enabled then + -- L1 exact lookup + if layer_enabled(conf, "exact") then local cached_text, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) if lookup_err then core.log.warn("ai-cache: L1 lookup error: ", lookup_err) elseif cached_text then - core.log.info("ai-cache: L1 hit for key ", prompt_hash) + core.log.info("ai-cache: L1 hit for key: ", prompt_hash) ctx.ai_cache_status = "HIT-L1" ctx.ai_cache_written_at = written_at local is_stream = body_tab.stream == true @@ -123,6 +126,46 @@ function _M.access(conf, ctx) end end + -- L2 semantic lookup + if layer_enabled(conf, "semantic") then + local emb_conf = conf.semantic.embedding + local emb_driver = require("apisix.plugins.ai-cache.embeddings." .. 
emb_conf.provider) + local httpc = http.new() + + local embedding, _, emb_err = emb_driver.get_embeddings(emb_conf, prompt_text, httpc, true) + if not embedding then + core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err) + else + ctx.ai_cache_embedding = embedding + + local threshold = conf.semantic.similarity_threshold or 0.95 + local cached_text, similarity, search_err = semantic.search( + conf, scope_hash, embedding, threshold + ) + + if search_err then + core.log.warn("ai-cache: L2 search error (degrading to MISS): ", search_err) + elseif cached_text then + core.log.info("ai-cache: L2 hit, similarity=", similarity) + + local l1_ttl = (conf.exact and conf.exact.ttl) or 3600 + local l1_err = exact.set(conf, scope_hash, prompt_hash, cached_text, l1_ttl) + + if l1_err then + core.log.warn("ai-cache: L2->L1 backfill failed: ", l1_err) + end + + ctx.ai_cache_status = "HIT-L2" + ctx.ai_cache_similarity = similarity + local is_stream = body_tab.stream == true + return core.response.exit(200, proto.build_deny_response({ + stream = is_stream, + text = cached_text, + })) + end + end + end + ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" ctx.ai_cache_scope_hash = scope_hash @@ -145,6 +188,12 @@ function _M.header_filter(conf, ctx) or "X-AI-Cache-Age" ngx.header[age_header] = tostring(ngx_time() - ctx.ai_cache_written_at) end + + if ctx.ai_cache_status == "HIT-L2" and ctx.ai_cache_similarity then + local sim_header = (conf.headers and conf.headers.cache_similarity) + or "X-AI-Cache-Similarity" + ngx.header[sim_header] = tostring(ctx.ai_cache_similarity) + end end @@ -163,21 +212,56 @@ function _M.log(conf, ctx) return end - local ttl = (conf.exact and conf.exact.ttl) or 3600 + local exact_enabled = layer_enabled(conf, "exact") + local semantic_enabled = layer_enabled(conf, "semantic") + local ttl_exact = (conf.exact and conf.exact.ttl) or 3600 local scope_hash = ctx.ai_cache_scope_hash local prompt_hash = ctx.ai_cache_prompt_hash + local embedding = ctx.ai_cache_embedding + local prompt_text = ctx.ai_cache_prompt_text ngx.timer.at(0, function(premature) if premature then return end - local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl) - if err then - ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) + if exact_enabled then + local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl_exact) + if err then + ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) + end + end + + if semantic_enabled then + local vec = embedding + + if not vec then + local emb_conf = conf.semantic.embedding + local emb_driver = require( + "apisix.plugins.ai-cache.embeddings." .. 
emb_conf.provider + ) + local httpc = http.new() + local emb, _, emb_err = emb_driver.get_embeddings( + emb_conf, prompt_text, httpc, true + ) + if not emb then + ngx.log(ngx.WARN, + "ai-cache: failed to get embedding for L2 store: ", emb_err) + return + end + vec = emb + end + + local ttl_semantic = (conf.semantic and conf.semantic.ttl) or 86400 + local store_err = semantic.store( + conf, scope_hash, vec, response_text, ttl_semantic + ) + if store_err then + ngx.log(ngx.WARN, "ai-cache: failed to write L2 cache: ", store_err) + end end end) end -return _M \ No newline at end of file +return _M diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua new file mode 100644 index 000000000000..38ace604a617 --- /dev/null +++ b/apisix/plugins/ai-cache/semantic.lua @@ -0,0 +1,173 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local apisix_redis = require("apisix.utils.redis") +local uuid = require("resty.jit-uuid") +local ffi = require("ffi") + +local ffi_new = ffi.new +local ffi_string = ffi.string +local ngx_time = ngx.time +local tostring = tostring +local tonumber = tonumber +local type = type + +local INDEX_NAME = "ai-cache-idx" +local KEY_PREFIX = "ai-cache:l2:" + +local _M = {} + +local function pack_vector(vec) + local n = #vec + local buf = ffi_new("float[?]", n) + for i = 0, n - 1 do + buf[i] = vec[i + 1] + end + return ffi_string(buf, n * 4) +end + +local index_ready = false + +local function ensure_index(red, dim) + if index_ready then + return true + end + + local _, err = red["FT.CREATE"](red, + INDEX_NAME, + "ON", "HASH", + "PREFIX", "1", KEY_PREFIX, + "SCHEMA", + "embedding", "VECTOR", "HNSW", "6", + "TYPE", "FLOAT32", + "DIM", tostring(dim), + "DISTANCE_METRIC", "COSINE", + "scope", "TAG", + "created_at", "NUMERIC" + ) + + if err and not err:find("already exists") then + return nil, "FT.CREATE failed: " .. err + end + + index_ready = true + return true +end + + +function _M.search(conf, scope_hash, embedding_vec, threshold) + local red, err = apisix_redis.new(conf) + if not red then + return nil, nil, err + end + + local ok, init_err = ensure_index(red, #embedding_vec) + if not ok then + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return nil, nil, init_err + end + + local binary_vec = pack_vector(embedding_vec) + + local query + if scope_hash == "" then + query = "*=>[KNN 1 @embedding $vec AS dist]" + else + query = "@scope:{" .. scope_hash .. 
"} *=>[KNN 1 @embedding $vec AS dist]" + end + + local res, search_err = red["FT.SEARCH"](red, + INDEX_NAME, + query, + "PARAMS", "2", "vec", binary_vec, + "SORTBY", "dist", "ASC", + "LIMIT", "0", "1", + "RETURN", "2", "response", "dist", + "DIALECT", "2" + ) + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + + if search_err then + return nil, nil, search_err + end + + if not res or res[1] == 0 then + return nil, nil, nil + end + + -- RESP2: {count, key, {field, val, field, val, ...}, ...} + local fields = res[3] + if type(fields) ~= "table" then + return nil, nil, nil + end + + local response_text, dist + for i = 1, #fields, 2 do + if fields[i] == "response" then + response_text = fields[i + 1] + elseif fields[i] == "dist" then + dist = tonumber(fields[i + 1]) + end + end + + if not response_text or not dist then + return nil, nil, nil + end + + local similarity = 1 - dist + if similarity < threshold then + return nil, nil, nil + end + + return response_text, similarity, nil +end + + +function _M.store(conf, scope_hash, embedding_vec, text, ttl) + local red, err = apisix_redis.new(conf) + if not red then + return err + end + + local ok, init_err = ensure_index(red, #embedding_vec) + if not ok then + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return init_err + end + + local binary_vec = pack_vector(embedding_vec) + local key = KEY_PREFIX .. uuid.generate_v4() + + local set_ok, set_err = red:hset(key, + "embedding", binary_vec, + "response", text, + "scope", scope_hash, + "created_at", tostring(ngx_time()) + ) + + if not set_ok then + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return set_err + end + + red:expire(key, ttl) + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return nil +end + + +return _M diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index a375d50bd9ca..f25379413c40 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -38,6 +38,29 @@ add_block_preprocessor(sub { $block->set_value("no_error_log", "[error]\n[alert]"); } + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + listen 1990; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + local fixture_loader = require("lib.fixture_loader") + local content, err = fixture_loader.load("openai/embeddings-list.json") + if not content then + ngx.status = 500 + ngx.say(err) + return + end + + ngx.status = 200 + ngx.print(content) + } + } +} +_EOC_ + } }); run_tests(); @@ -372,7 +395,7 @@ X-AI-Cache-Status: MISS === TEST 15: openai driver - parses embedding vector correctly --- http_config server { - listen 1991; + listen 1990; default_type 'application/json'; location /v1/embeddings { @@ -406,7 +429,7 @@ server { local httpc = http.new() local conf = { - endpoint = "http://127.0.0.1:1991/v1/embeddings", + endpoint = "http://127.0.0.1:1990/v1/embeddings", api_key = "test-key", model = "text-embedding-3-small", } @@ -433,7 +456,7 @@ ok: 0.1 0.2 0.3 === TEST 16: openai driver - 429 from API return nil with status --- http_config server { - listen 1991; + listen 1990; default_type 'application/json'; location /v1/embeddings { @@ -451,7 +474,7 @@ server { local httpc = http.new() local conf = { - endpoint = "http://127.0.0.1:1991/v1/embeddings", + endpoint = "http://127.0.0.1:1990/v1/embeddings", api_key = "test-key", } @@ -472,7 +495,7 @@ status: 429 === TEST 17: azure_openai driver - parses embedding vector correctly --- 
http_config server { - listen 1991; + listen 1990; default_type 'application/json'; location /embeddings { @@ -503,7 +526,7 @@ server { local httpc = http.new() local conf = { - endpoint = "http://127.0.0.1:1991/embeddings", + endpoint = "http://127.0.0.1:1990/embeddings", api_key = "azure-test-key", } @@ -524,7 +547,7 @@ ok: 0.4 0.5 0.6 === TEST 18: openai driver - 500 from API returns nil with status --- http_config server { - listen 1991; + listen 1990; default_type 'application/json'; location /v1/embeddings { @@ -542,7 +565,7 @@ server { local httpc = http.new() local conf = { - endpoint = "http://127.0.0.1:1991/v1/embeddings", + endpoint = "http://127.0.0.1:1990/v1/embeddings", api_key = "test-key", } @@ -557,3 +580,148 @@ server { } --- response_body status: 500 + + + +=== TEST 19: clean up L2 state before semantic tests +--- config + location /t { + content_by_lua_block { + local redis = require("resty.redis") + local red = redis:new() + red:set_timeout(1000) + assert(red:connect("127.0.0.1", 6379)) + + red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") + + local keys = red:keys("ai-cache:*") + if type(keys) == "table" and #keys > 0 then + red:del(unpack(keys)) + end + + red:close() + ngx.say("ok") + } + } +--- response_body +ok + + + +=== TEST 20: set up route for L2 semantic cache tests +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/3', + ngx.HTTP_PUT, + [[{ + "uri": "/semantic", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { + "ttl": 60 + }, + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings", + "api_key": "test-key" + } + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 21: L2 - first request, cache MISS, stored in L2 +--- request +POST /semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 22: L2 - different wording hits L2 (same vector from fixture) +--- request +POST /semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 +--- response_body_like eval +qr/content/ +--- error_log +ai-cache: L2 hit + + + +=== TEST 23: L2 - original prompt now hits L1 (backfilled by the L2 hit) +--- request +POST /semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- error_log +ai-cache: L1 hit for key + + + +=== TEST 24: L2 degradation - search error results in MISS, not 500 +--- config + location /t { + content_by_lua_block { + local semantic = require("apisix.plugins.ai-cache.semantic") + local conf = { + redis_host = "127.0.0.1", + redis_port = 6379, + redis_timeout = 100, + } + + local text, 
sim, err = semantic.search(conf, "", {0.1, 0.2, 0.3}, 0.95) + if err then + ngx.say("degraded gracefully") + else + ngx.say("miss, no error") + end + } + } +--- response_body_like eval +qr/degraded gracefully|miss, no error/ From 8d64a394a4afd5fd5f264b71b86d08bbc72656b3 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 01:32:18 +0800 Subject: [PATCH 07/38] feat(ai-cache): set Content-Type text/event-stream on streaming cache hits --- apisix/plugins/ai-cache.lua | 10 +++++++-- t/plugin/ai-cache.t | 44 +++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 216291692ca9..d0b8e84a9298 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -109,6 +109,8 @@ function _M.access(conf, ctx) return end + local is_stream = body_tab.stream == true + -- L1 exact lookup if layer_enabled(conf, "exact") then local cached_text, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) @@ -118,7 +120,9 @@ function _M.access(conf, ctx) core.log.info("ai-cache: L1 hit for key: ", prompt_hash) ctx.ai_cache_status = "HIT-L1" ctx.ai_cache_written_at = written_at - local is_stream = body_tab.stream == true + if is_stream then + core.response.set_header("Content-Type", "text/event-stream") + end return core.response.exit(200, proto.build_deny_response({ stream = is_stream, text = cached_text, @@ -157,7 +161,9 @@ function _M.access(conf, ctx) ctx.ai_cache_status = "HIT-L2" ctx.ai_cache_similarity = similarity - local is_stream = body_tab.stream == true + if is_stream then + core.response.set_header("Content-Type", "text/event-stream") + end return core.response.exit(200, proto.build_deny_response({ stream = is_stream, text = cached_text, diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index f25379413c40..11aed3bc2da3 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -725,3 +725,47 @@ ai-cache: L1 hit for key } --- response_body_like eval qr/degraded gracefully|miss, no error/ + + + +=== TEST 25: streaming MISS - upstream called, response cached via log phase +--- request +POST /chat +{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-streaming.sse +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 26: streaming HIT - Content-Type is text/event-stream, SSE body returned +--- request +POST /chat +{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +Content-Type: text/event-stream +--- response_body_like eval +qr/data:.*content/ +--- wait: 1 + + + +=== TEST 27: non-streaming HIT after streaming MISS - returns JSON +--- request +POST /chat +{"messages":[{"role":"user","content":"Stream me something cool"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +Content-Type: application/json +--- response_body_like eval +qr/content/ From 6f72c377dc55728e095d79e733901d3557978f0d Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 04:32:12 +0800 Subject: [PATCH 08/38] feat(ai-cache): add Prometheus metrics for hits, misses, and embedding latency --- apisix/plugins/ai-cache.lua | 19 +- apisix/plugins/prometheus/exporter.lua | 53 +++++ t/plugin/prometheus-ai-cache.t | 305 
+++++++++++++++++++++++++ 3 files changed, 369 insertions(+), 8 deletions(-) create mode 100644 t/plugin/prometheus-ai-cache.t diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index d0b8e84a9298..94de0043c123 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -22,6 +22,7 @@ local semantic = require("apisix.plugins.ai-cache.semantic") local protocols = require("apisix.plugins.ai-protocols") local http = require("resty.http") local ngx_time = ngx.time +local ngx_now = ngx.now local tostring = tostring local table_concat = table.concat @@ -78,7 +79,6 @@ function _M.access(conf, ctx) local body_tab, err = core.request.get_json_request_body_table() if not body_tab then core.log.warn("ai-cache: failed to read request body: ", err or "unknown error") - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end @@ -86,7 +86,6 @@ function _M.access(conf, ctx) local protocol_name = protocols.detect(body_tab, ctx) if not protocol_name then core.log.warn("ai-cache: could not detect AI protocol, skipping cache") - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end @@ -94,7 +93,6 @@ function _M.access(conf, ctx) local proto = protocols.get(protocol_name) local contents = proto.extract_request_content(body_tab) if not contents or #contents == 0 then - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end @@ -104,7 +102,6 @@ function _M.access(conf, ctx) local prompt_hash, hash_err = exact.compute_prompt_hash(prompt_text) if not prompt_hash then core.log.warn("ai-cache: failed to compute prompt hash: ", hash_err) - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" return end @@ -122,6 +119,8 @@ function _M.access(conf, ctx) ctx.ai_cache_written_at = written_at if is_stream then core.response.set_header("Content-Type", "text/event-stream") + else + core.response.set_header("Content-Type", "application/json") end return core.response.exit(200, proto.build_deny_response({ stream = is_stream, @@ -136,10 +135,13 @@ function _M.access(conf, ctx) local emb_driver = require("apisix.plugins.ai-cache.embeddings." .. 
emb_conf.provider) local httpc = http.new() + local t0 = ngx_now() local embedding, _, emb_err = emb_driver.get_embeddings(emb_conf, prompt_text, httpc, true) if not embedding then core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err) else + ctx.ai_cache_embedding_latency_ms = (ngx_now() - t0) * 1000 + ctx.ai_cache_embedding_provider = emb_conf.provider ctx.ai_cache_embedding = embedding local threshold = conf.semantic.similarity_threshold or 0.95 @@ -163,6 +165,8 @@ function _M.access(conf, ctx) ctx.ai_cache_similarity = similarity if is_stream then core.response.set_header("Content-Type", "text/event-stream") + else + core.response.set_header("Content-Type", "application/json") end return core.response.exit(200, proto.build_deny_response({ stream = is_stream, @@ -172,7 +176,6 @@ function _M.access(conf, ctx) end end - ctx.ai_cache_miss = true ctx.ai_cache_status = "MISS" ctx.ai_cache_scope_hash = scope_hash ctx.ai_cache_prompt_hash = prompt_hash @@ -204,12 +207,12 @@ end function _M.log(conf, ctx) - if not ctx.ai_cache_miss or ctx.ai_cache_bypass then + if ctx.ai_cache_status ~= "MISS" then return end - local status = core.response.get_upstream_status(ctx) or ngx.status - if not status or status < 200 or status >= 300 then + local upstream_status = core.response.get_upstream_status(ctx) or ngx.status + if not upstream_status or upstream_status < 200 or upstream_status >= 300 then return end diff --git a/apisix/plugins/prometheus/exporter.lua b/apisix/plugins/prometheus/exporter.lua index ce89ca03302a..cf008400b0f0 100644 --- a/apisix/plugins/prometheus/exporter.lua +++ b/apisix/plugins/prometheus/exporter.lua @@ -160,6 +160,12 @@ function _M.http_init(prometheus_enabled_in_stream) "llm_completion_tokens", "expire") local llm_active_connections_exptime = core.table.try_read_attr(attr, "metrics", "llm_active_connections", "expire") + local ai_cache_hits_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_hits", "expire") + local ai_cache_misses_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_misses", "expire") + local ai_cache_embedding_latency_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_embedding_latency", "expire") prometheus = base_prometheus.init("prometheus-metrics", metric_prefix) @@ -260,6 +266,29 @@ function _M.http_init(prometheus_enabled_in_stream) unpack(extra_labels("llm_active_connections"))}, llm_active_connections_exptime) + metrics.ai_cache_hits = prometheus:counter("ai_cache_hits_total", + "AI cache hit count by layer", + {"route_id", "service_id", "consumer", "layer", + unpack(extra_labels("ai_cache_hits"))}, + ai_cache_hits_exptime) + + metrics.ai_cache_misses = prometheus:counter("ai_cache_misses_total", + "AI cache miss count", + {"route_id", "service_id", "consumer", + unpack(extra_labels("ai_cache_misses"))}, + ai_cache_misses_exptime) + + local ai_cache_embedding_latency_buckets = DEFAULT_BUCKETS + if attr and attr.ai_cache_embedding_latency_buckets then + ai_cache_embedding_latency_buckets = attr.ai_cache_embedding_latency_buckets + end + metrics.ai_cache_embedding_latency = prometheus:histogram("ai_cache_embedding_latency", + "AI cache embedding API call latency in milliseconds", + {"route_id", "service_id", "consumer", "provider", + unpack(extra_labels("ai_cache_embedding_latency"))}, + ai_cache_embedding_latency_buckets, + ai_cache_embedding_latency_exptime) + if prometheus_enabled_in_stream then init_stream_metrics() end @@ -377,6 +406,29 @@ function _M.http_log(conf, ctx) 
vars.request_type, vars.request_llm_model, vars.llm_model, unpack(extra_labels("llm_completion_tokens", ctx)))) end + + if ctx.ai_cache_status then + if ctx.ai_cache_status == "HIT-L1" then + metrics.ai_cache_hits:inc(1, + gen_arr(route_id, service_id, consumer_name, "l1", + unpack(extra_labels("ai_cache_hits", ctx)))) + elseif ctx.ai_cache_status == "HIT-L2" then + metrics.ai_cache_hits:inc(1, + gen_arr(route_id, service_id, consumer_name, "l2", + unpack(extra_labels("ai_cache_hits", ctx)))) + elseif ctx.ai_cache_status == "MISS" then + metrics.ai_cache_misses:inc(1, + gen_arr(route_id, service_id, consumer_name, + unpack(extra_labels("ai_cache_misses", ctx)))) + end + + if ctx.ai_cache_embedding_latency_ms then + metrics.ai_cache_embedding_latency:observe(ctx.ai_cache_embedding_latency_ms, + gen_arr(route_id, service_id, consumer_name, + ctx.ai_cache_embedding_provider or "", + unpack(extra_labels("ai_cache_embedding_latency", ctx)))) + end + end end @@ -790,6 +842,7 @@ function _M.dec_llm_active_connections(ctx) inc_llm_active_connections(ctx, -1) end + function _M.get_prometheus() return prometheus end diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t new file mode 100644 index 000000000000..fc4eb02264bc --- /dev/null +++ b/t/plugin/prometheus-ai-cache.t @@ -0,0 +1,305 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +BEGIN { + $ENV{TEST_ENABLE_CONTROL_API_V1} = "0"; + + if ($ENV{TEST_NGINX_CHECK_LEAK}) { + $SkipReason = "unavailable for the hup tests"; + } else { + $ENV{TEST_NGINX_USE_HUP} = 1; + undef $ENV{TEST_NGINX_USE_STAP}; + } +} + +use t::APISIX 'no_plan'; + +repeat_each(1); +no_long_string(); +no_shuffle(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + my $user_yaml_config = <<_EOC_; +plugin_attr: + prometheus: + refresh_interval: 0.1 +plugins: + - ai-proxy + - ai-cache + - prometheus + - public-api +_EOC_ + $block->set_value("extra_yaml_config", $user_yaml_config); + + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + listen 1990; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + local fixture_loader = require("lib.fixture_loader") + local content, err = fixture_loader.load("openai/embeddings-list.json") + if not content then + ngx.status = 500 + ngx.say(err) + return + end + + ngx.status = 200 + ngx.print(content) + } + } +} +_EOC_ + } +}); + +run_tests; + +__DATA__ + +=== TEST 1: set up routes +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + local routes = { + { + url = "/apisix/admin/routes/1", + data = [[{ + "uri": "/chat", + "plugins": { + "prometheus": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1", + "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}] + } + } + }]], + }, + { + url = "/apisix/admin/routes/2", + data = [[{ + "uri": "/semantic", + "plugins": { + "prometheus": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { "ttl": 60 }, + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings", + "api_key": "test-key" + } + }, + "redis_host": "127.0.0.1" + } + } + }]], + }, + { + url = "/apisix/admin/routes/metrics", + data = [[{ + "plugins": { + "public-api": {} + }, + "uri": "/apisix/prometheus/metrics" + }]], + }, + } + + for _, route in ipairs(routes) do + local code, body = t(route.url, ngx.HTTP_PUT, route.data) + if code >= 300 then + ngx.status = code + end + ngx.say(body) + end + } + } +--- response_body eval +"passed\n" x 3 + + + +=== TEST 2: MISS request - upstream called +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the meaning of life?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 3: same request - HIT-L1 +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the meaning of life?"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- wait: 1 + + + +=== TEST 4: verify miss counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1/ + 
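+# The assertions in TEST 4 and TEST 5 read the Prometheus text exposition
+# directly. The "layer" label distinguishes exact-layer hits (layer="l1")
+# from semantic-layer hits (layer="l2"), so a per-layer hit ratio can be
+# derived by dividing each apisix_ai_cache_hits_total series by the sum of
+# hits and apisix_ai_cache_misses_total for the same route.
+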
+ + +=== TEST 5: verify hit counter with layer label +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1/ + + + +=== TEST 6: BYPASS request - upstream called, no cache interaction +--- request +POST /chat +{"messages":[{"role":"user","content":"What is the meaning of life?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Cache-Bypass: 1 +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 7: verify BYPASS did not increment misses counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1\n/ + + + +=== TEST 8: cleanup Redis L2 state before semantic tests +--- config + location /t { + content_by_lua_block { + local redis = require("resty.redis") + local red = redis:new() + red:set_timeout(1000) + assert(red:connect("127.0.0.1", 6379)) + + red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") + + local keys = red:keys("ai-cache:*") + if type(keys) == "table" and #keys > 0 then + red:del(unpack(keys)) + end + + red:close() + ngx.say("ok") + } + } +--- response_body +ok + + + +=== TEST 9: L2 first request - MISS, embedding API called +--- request +POST /semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 10: L2 second request - different wording, HIT-L2 +--- request +POST /semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 +--- wait: 1 + + + +=== TEST 11: verify hits counter with layer="l2" +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="2",service_id="",consumer="",layer="l2"\} 1/ + + + +=== TEST 12: verify embedding latency histogram with provider label +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_embedding_latency_count\{route_id="2",service_id="",consumer="",provider="openai"\} 2/ From 4cf5c1c0bbeaa426ce1e3014bd7ca3ece2939e3c Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 06:35:10 +0800 Subject: [PATCH 09/38] docs(ai-cache): add plugin documentation --- apisix/plugins/ai-cache.lua | 10 +- docs/en/latest/config.json | 1 + docs/en/latest/plugins/ai-cache.md | 656 +++++++++++++++++++++++++++++ 3 files changed, 666 insertions(+), 1 deletion(-) create mode 100644 docs/en/latest/plugins/ai-cache.md diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 94de0043c123..7caf6cf548ee 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -221,6 +221,14 @@ function _M.log(conf, ctx) return end + local max_size = conf.max_cache_body_size or 1048576 + if #response_text > max_size then + core.log.warn("ai-cache: response size ", #response_text, + " exceeds max_cache_body_size ", max_size, + ", skipping cache write") + return + end + local exact_enabled = layer_enabled(conf, "exact") local semantic_enabled = layer_enabled(conf, "semantic") local ttl_exact = (conf.exact and conf.exact.ttl) or 3600 @@ -233,7 +241,7 @@ function _M.log(conf, ctx) if 
premature then return end - + if exact_enabled then local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl_exact) if err then diff --git a/docs/en/latest/config.json b/docs/en/latest/config.json index d24eacc3f8e9..c198826c7505 100644 --- a/docs/en/latest/config.json +++ b/docs/en/latest/config.json @@ -75,6 +75,7 @@ "plugins/ai-proxy-multi", "plugins/ai-rate-limiting", "plugins/ai-prompt-guard", + "plugins/ai-cache", "plugins/ai-aws-content-moderation", "plugins/ai-aliyun-content-moderation", "plugins/ai-prompt-decorator", diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md new file mode 100644 index 000000000000..71de88a32261 --- /dev/null +++ b/docs/en/latest/plugins/ai-cache.md @@ -0,0 +1,656 @@ +--- +title: ai-cache +keywords: + - Apache APISIX + - API Gateway + - Plugin + - ai-cache +description: The ai-cache Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache, reducing latency and upstream cost. +--- + + + + + + + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Description + +The `ai-cache` Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache instead of incurring another upstream call. It supports two cache layers: an exact-match layer (`exact`) keyed by a hash of the prompt, and a semantic layer (`semantic`) that compares prompt embeddings via Redis Stack vector search. Either layer can be enabled independently, and a hit on the semantic layer backfills the exact layer so subsequent identical prompts return immediately. + +The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-multi](./ai-proxy-multi.md) on the same Route. The semantic layer requires Redis Stack with the RediSearch module and an embedding provider (OpenAI or Azure OpenAI). + +## Plugin Attributes + +| Name | Type | Required | Default | Valid values | Description | +| --- | --- | --- | --- | --- | --- | +| `layers` | array[string] | False | `["exact", "semantic"]` | `"exact"`, `"semantic"` | Cache layers to enable, queried in order. | +| `exact.ttl` | integer | False | `3600` | ≥ 1 | Time-to-live in seconds for exact-layer entries. | +| `semantic.similarity_threshold` | number | False | `0.95` | 0–1 | Minimum cosine similarity required for a semantic-layer hit. | +| `semantic.ttl` | integer | False | `86400` | ≥ 1 | Time-to-live in seconds for semantic-layer entries. | +| `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. | +| `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | HTTPS URL of the embedding API. | +| `semantic.embedding.api_key` | string | True (if semantic enabled) | | | API key for the embedding provider. Stored encrypted. | +| `semantic.embedding.model` | string | False | | | Embedding model name. Uses provider default if omitted. | +| `cache_key.include_consumer` | boolean | False | `false` | | If `true`, partition the cache by consumer name. | +| `cache_key.include_vars` | array[string] | False | `[]` | | Additional `ctx.var` names included in the cache key, for example `["$http_x_tenant_id"]`. | +| `bypass_on` | array[object] | False | | | List of `{header, equals}` rules. If any matches, the request bypasses the cache. | +| `max_cache_body_size` | integer | False | `1048576` | ≥ 1 | Maximum response size in bytes to write to cache. Larger responses pass through but are not cached. 
| +| `headers.cache_status` | string | False | `"X-AI-Cache-Status"` | | Response header for cache status (`HIT-L1`, `HIT-L2`, `MISS`, `BYPASS`). | +| `headers.cache_age` | string | False | `"X-AI-Cache-Age"` | | Response header for the age in seconds of an exact-layer hit. | +| `headers.cache_similarity` | string | False | `"X-AI-Cache-Similarity"` | | Response header for the similarity score of a semantic-layer hit. | + +Redis connection fields (`redis_host`, `redis_port`, `redis_password`, `redis_database`, `redis_timeout`, `redis_ssl`, `redis_ssl_verify`, `redis_username`, `redis_keepalive_timeout`, `redis_keepalive_pool`) follow the shared Redis schema. At minimum, `redis_host` is required. + +## Examples + +The following examples use OpenAI as the Upstream service provider. Before proceeding, create an [OpenAI account](https://openai.com) and an [API key](https://openai.com/blog/openai-api). You can optionally save the key to an environment variable: + +```shell +export OPENAI_API_KEY= +``` + +If you are working with other LLM providers, please refer to the provider's documentation to obtain an API key. + +:::note + +You can fetch the `admin_key` from `config.yaml` and save to an environment variable with the following command: + +```shell +admin_key=$(yq '.deployment.admin.admin_key[0].key' conf/config.yaml | sed 's/"//g') +``` + +::: + +### Cache Identical Prompts with the Exact Layer + +The following example demonstrates how to use the `ai-cache` Plugin with the exact layer only, so that identical prompts are returned from cache. + + + + +Create a Route that uses [ai-proxy](./ai-proxy.md) to proxy to OpenAI and `ai-cache` to cache exact-match prompts: + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 3600 }, + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such: + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + exact: + ttl: 3600 + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + +Send a request to the Route: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" 
} + ] + }' +``` + +The first request reaches OpenAI and you should receive a response similar to the following: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-Da7Iqsqz9gc8Mkf07Hn4NCzAH5Ri1", + "object": "chat.completion", + "created": 1777500252, + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital of France is Paris.", + "refusal": null + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 14, + "completion_tokens": 7, + "total_tokens": 21 + }, + "system_fingerprint": "fp_d3214ccada" +} +``` + +Send the same request again: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" } + ] + }' +``` + +The second request returns from cache without contacting OpenAI. The cached response is replayed from Redis, so the body is shorter and does not contain the original `created`, `model`, `usage`, or `system_fingerprint` fields: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: HIT-L1 +X-AI-Cache-Age: 4 + +{ + "id": "f558665e-3a03-42e3-9aa9-f54c402927c0", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "content": "The capital of France is Paris.", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` + +### Cache Paraphrased Prompts with the Semantic Layer + +The following example demonstrates how to enable the semantic layer so that prompts with different wording but similar meaning are served from cache. + + + + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { "ttl": 3600 }, + "semantic": { + "similarity_threshold": 0.92, + "ttl": 86400, + "embedding": { + "provider": "openai", + "endpoint": "https://api.openai.com/v1/embeddings", + "api_key": "'"$OPENAI_API_KEY"'", + "model": "text-embedding-3-small" + } + }, + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + - semantic + exact: + ttl: 3600 + semantic: + similarity_threshold: 0.92 + ttl: 86400 + embedding: + provider: openai + endpoint: https://api.openai.com/v1/embeddings + api_key: "${OPENAI_API_KEY}" + model: text-embedding-3-small + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + +Send a first request: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" 
} + ] + }' +``` + +The first request reaches OpenAI: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-Da7Iqsqz9gc8Mkf07Hn4NCzAH5Ri1", + "object": "chat.completion", + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital of France is Paris." + }, + "finish_reason": "stop" + } + ], + "usage": { "prompt_tokens": 14, "completion_tokens": 7, "total_tokens": 21 } +} +``` + +Wait a couple of seconds for the semantic-layer write to complete in the background, then send a second request with paraphrased wording: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "capital of France what is?" } + ] + }' +``` + +The semantic layer matches the embedding (cosine similarity above the threshold) and returns the cached response without contacting OpenAI: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: HIT-L2 +X-AI-Cache-Similarity: 0.9720680713654 + +{ + "id": "40b612a5-1424-4096-b7ec-8537a1ee6fd3", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "content": "The capital of France is Paris.", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` + +A semantic-layer hit also backfills the exact layer, so an immediate retry of the same paraphrase returns `X-AI-Cache-Status: HIT-L1`. + +### Isolate Cache Entries Per Consumer or Tenant + +The following example demonstrates how to namespace cache entries so that one consumer's response is not served to another. Use `cache_key.include_consumer` to partition by consumer name, or `cache_key.include_vars` to include request variables such as a tenant header. + + + + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 3600 }, + "cache_key": { + "include_consumer": true, + "include_vars": ["$http_x_tenant_id"] + }, + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + exact: + ttl: 3600 + cache_key: + include_consumer: true + include_vars: + - "$http_x_tenant_id" + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + +Two requests with the same prompt but different `X-Tenant-Id` headers each receive `X-AI-Cache-Status: MISS`, because the cache key now includes the tenant identifier. + +### Bypass the Cache on a Header + +The following example demonstrates how to skip the cache entirely when a request carries a specific header, for example to refresh a cached response or to support staff debugging. 
+ + + + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 3600 }, + "bypass_on": [ + { "header": "X-Cache-Bypass", "equals": "1" } + ], + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + exact: + ttl: 3600 + bypass_on: + - header: X-Cache-Bypass + equals: "1" + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + +Send a request with the bypass header: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Cache-Bypass: 1" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" } + ] + }' +``` + +The request reaches OpenAI even though a cached entry exists, and the response is not written back to cache. You can confirm the upstream was contacted because the response includes the original `created`, `model`, `usage`, and `system_fingerprint` fields: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: BYPASS + +{ + "id": "chatcmpl-Da7N4E9fA6KoQ7av98hL0zxplPCcD", + "object": "chat.completion", + "created": 1777500514, + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital of France is Paris.", + "refusal": null + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 14, + "completion_tokens": 7, + "total_tokens": 21 + }, + "system_fingerprint": "fp_d3214ccada" +} +``` + +## Caveats + +### The semantic-layer write is asynchronous + +After a `MISS`, the embedding fetch and Redis vector store happen in a background timer. If you send a paraphrased prompt immediately after the first request, you may see another `MISS` because the entry has not been stored yet. Wait a couple of seconds before sending a paraphrase to verify a semantic hit. + +### Similarity is mathematical, not human-judged + +Two prompts that look semantically equivalent to a human can score below the configured `similarity_threshold` and therefore miss the cache. Conversely, a small wording change can flip the result. For example, with `similarity_threshold` set to `0.85` and the cache primed with `"What is the capital of France?"`: + +| Prompt | Status | Similarity | +|--------|--------|------------| +| `capital of France?` | `HIT-L2` | `0.850` | +| `capital of France what?` | `MISS` | (below threshold) | +| `capital of France what is?` | `HIT-L2` | `0.972` | +| `capital of France what please?` | `HIT-L2` | `0.924` | +| `capital of France what is please tell me?` | `MISS` | (below threshold) | + +Lower the threshold to catch more paraphrases at the cost of occasionally serving a cached answer for a genuinely different question. Tune empirically against your traffic. 
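+
+For intuition, the similarity scores above are plain cosine similarity between
+the two prompts' embedding vectors: Redis returns a cosine *distance*, and the
+Plugin converts it with `similarity = 1 - dist` before comparing against the
+threshold. A self-contained sketch of the same check, using toy 3-dimensional
+vectors in place of real embeddings (which have e.g. 1536 dimensions):
+
+```lua
+local function cosine_similarity(a, b)
+    local dot, na, nb = 0, 0, 0
+    for i = 1, #a do
+        dot = dot + a[i] * b[i]
+        na  = na + a[i] * a[i]
+        nb  = nb + b[i] * b[i]
+    end
+    return dot / (math.sqrt(na) * math.sqrt(nb))
+end
+
+-- Toy vectors standing in for two paraphrased prompts
+local cached = { 0.12, 0.98, 0.05 }
+local query  = { 0.10, 0.99, 0.07 }
+
+local sim = cosine_similarity(cached, query)
+print(sim)           -- ~0.9996, well above a 0.85 threshold
+print(sim >= 0.85)   -- true: the entry would be served as HIT-L2
+```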
+ +### Embedding model dimensions are baked into the index + +Redis Stack creates the vector index on the first request with a fixed `DIM` matching the embedding vector size (for example `1536` for `text-embedding-3-small`, `3072` for `text-embedding-3-large`). If you switch embedding models, or if the index was created with different-sized vectors during testing, subsequent requests will fail with a size-mismatch error in the APISIX warn log: + +```text +ai-cache: L2 search error: Error parsing vector similarity query: +query vector blob size (6144) does not match index's expected size (12). +``` + +The Plugin degrades to `MISS` so requests still succeed, but the semantic layer effectively stops working. Drop the index to recover; it will be recreated on the next request with the correct dimension: + +```shell +docker exec redis-cli FT.DROPINDEX ai-cache-idx DD +docker exec redis-cli --raw KEYS "ai-cache:*" \ + | xargs -r docker exec -i redis-cli DEL +``` + +### `BYPASS` does not refresh the cache + +A request with the bypass header reaches the upstream but its response is not written back. Use it to force a fresh upstream call without invalidating or replacing the existing cached entry. + +### The semantic layer requires Redis Stack + +The `FT.CREATE` and `FT.SEARCH` commands used by the semantic layer come from the RediSearch module. Vanilla Redis will fail these commands and the layer will silently degrade to `MISS`. Use a Redis Stack image such as `redis/redis-stack:latest`. From 62850aa92e165503b37420d1c5f7d5b6b99dae60 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 06:55:35 +0800 Subject: [PATCH 10/38] feat(ai-cache): segregate L2 index by embedding dimension --- apisix/plugins/ai-cache/semantic.lua | 27 +++++++++++++++++---------- t/plugin/ai-cache.t | 1 + 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index 38ace604a617..38055daad906 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ b/apisix/plugins/ai-cache/semantic.lua @@ -26,11 +26,18 @@ local tostring = tostring local tonumber = tonumber local type = type -local INDEX_NAME = "ai-cache-idx" -local KEY_PREFIX = "ai-cache:l2:" - local _M = {} + +local function index_name(dim) + return "ai-cache-idx-" .. dim +end + + +local function key_prefix(dim) + return "ai-cache:l2:" .. dim .. ":" +end + local function pack_vector(vec) local n = #vec local buf = ffi_new("float[?]", n) @@ -40,17 +47,17 @@ local function pack_vector(vec) return ffi_string(buf, n * 4) end -local index_ready = false +local index_ready = {} local function ensure_index(red, dim) - if index_ready then + if index_ready[dim] then return true end local _, err = red["FT.CREATE"](red, - INDEX_NAME, + index_name(dim), "ON", "HASH", - "PREFIX", "1", KEY_PREFIX, + "PREFIX", "1", key_prefix(dim), "SCHEMA", "embedding", "VECTOR", "HNSW", "6", "TYPE", "FLOAT32", @@ -64,7 +71,7 @@ local function ensure_index(red, dim) return nil, "FT.CREATE failed: " .. err end - index_ready = true + index_ready[dim] = true return true end @@ -91,7 +98,7 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) end local res, search_err = red["FT.SEARCH"](red, - INDEX_NAME, + index_name(#embedding_vec), query, "PARAMS", "2", "vec", binary_vec, "SORTBY", "dist", "ASC", @@ -150,7 +157,7 @@ function _M.store(conf, scope_hash, embedding_vec, text, ttl) end local binary_vec = pack_vector(embedding_vec) - local key = KEY_PREFIX .. 
uuid.generate_v4() + local key = key_prefix(#embedding_vec) .. uuid.generate_v4() local set_ok, set_err = red:hset(key, "embedding", binary_vec, diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index 11aed3bc2da3..e5e012828db9 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -593,6 +593,7 @@ status: 500 assert(red:connect("127.0.0.1", 6379)) red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") + red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD") local keys = red:keys("ai-cache:*") if type(keys) == "table" and #keys > 0 then From f16c7e3bad1903e0b58805cdd4d5455f93bb02f0 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 06:56:41 +0800 Subject: [PATCH 11/38] docs(ai-cache): update doc to include Redis attr and update caveat --- docs/en/latest/plugins/ai-cache.md | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 71de88a32261..0571902bcedf 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -59,8 +59,16 @@ The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-m | `headers.cache_status` | string | False | `"X-AI-Cache-Status"` | | Response header for cache status (`HIT-L1`, `HIT-L2`, `MISS`, `BYPASS`). | | `headers.cache_age` | string | False | `"X-AI-Cache-Age"` | | Response header for the age in seconds of an exact-layer hit. | | `headers.cache_similarity` | string | False | `"X-AI-Cache-Similarity"` | | Response header for the similarity score of a semantic-layer hit. | - -Redis connection fields (`redis_host`, `redis_port`, `redis_password`, `redis_database`, `redis_timeout`, `redis_ssl`, `redis_ssl_verify`, `redis_username`, `redis_keepalive_timeout`, `redis_keepalive_pool`) follow the shared Redis schema. At minimum, `redis_host` is required. +| `redis_host` | string | True | | | The address of the Redis node. | +| `redis_port` | integer | False | `6379` | [1,...] | The port of the Redis node. | +| `redis_username` | string | False | | | The username for Redis if Redis ACL is used. If you use the legacy authentication method `requirepass`, configure only the `redis_password`. | +| `redis_password` | string | False | | | The password of the Redis node. | +| `redis_database` | integer | False | `0` | >= 0 | The database number in Redis. | +| `redis_timeout` | integer | False | `1000` | [1,...] | The Redis timeout value in milliseconds. | +| `redis_ssl` | boolean | False | `false` | | If `true`, use SSL to connect to Redis. | +| `redis_ssl_verify` | boolean | False | `false` | | If `true`, verify the server SSL certificate. | +| `redis_keepalive_timeout` | integer | False | `10000` | [1000,...] | Idle timeout in milliseconds for the Redis connection in the keepalive pool. | +| `redis_keepalive_pool` | integer | False | `100` | [1,...] | Maximum number of idle Redis connections kept in the keepalive pool. | ## Examples @@ -630,22 +638,9 @@ Two prompts that look semantically equivalent to a human can score below the con Lower the threshold to catch more paraphrases at the cost of occasionally serving a cached answer for a genuinely different question. Tune empirically against your traffic. -### Embedding model dimensions are baked into the index - -Redis Stack creates the vector index on the first request with a fixed `DIM` matching the embedding vector size (for example `1536` for `text-embedding-3-small`, `3072` for `text-embedding-3-large`). 
If you switch embedding models, or if the index was created with different-sized vectors during testing, subsequent requests will fail with a size-mismatch error in the APISIX warn log: - -```text -ai-cache: L2 search error: Error parsing vector similarity query: -query vector blob size (6144) does not match index's expected size (12). -``` - -The Plugin degrades to `MISS` so requests still succeed, but the semantic layer effectively stops working. Drop the index to recover; it will be recreated on the next request with the correct dimension: +### Switching embedding models is safe -```shell -docker exec redis-cli FT.DROPINDEX ai-cache-idx DD -docker exec redis-cli --raw KEYS "ai-cache:*" \ - | xargs -r docker exec -i redis-cli DEL -``` +The Plugin namespaces the L2 index and entries by embedding dimension (for example `1536` for `text-embedding-3-small`, `3072` for `text-embedding-3-large`), so changing the embedding model on a live route does not require any manual cleanup. A new index is created automatically for the new dimension; old entries from the previous model expire via the configured `semantic.ttl`. ### `BYPASS` does not refresh the cache From 087356093c67686ca9a692e2fd4895f30bd6b4e5 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 07:47:58 +0800 Subject: [PATCH 12/38] test(ai-cache): add cache_key scope partitioning tests in dedicated file --- apisix/plugins/ai-cache/semantic.lua | 2 +- t/plugin/ai-cache-scope.t | 355 +++++++++++++++++++++++++++ 2 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 t/plugin/ai-cache-scope.t diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index 38055daad906..b44ee5675711 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ b/apisix/plugins/ai-cache/semantic.lua @@ -94,7 +94,7 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) if scope_hash == "" then query = "*=>[KNN 1 @embedding $vec AS dist]" else - query = "@scope:{" .. scope_hash .. "} *=>[KNN 1 @embedding $vec AS dist]" + query = "@scope:{" .. scope_hash .. "}=>[KNN 1 @embedding $vec AS dist]" end local res, search_err = red["FT.SEARCH"](red, diff --git a/t/plugin/ai-cache-scope.t b/t/plugin/ai-cache-scope.t new file mode 100644 index 000000000000..97614fe173f8 --- /dev/null +++ b/t/plugin/ai-cache-scope.t @@ -0,0 +1,355 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +BEGIN { + $ENV{TEST_ENABLE_CONTROL_API_V1} = "0"; +} + +use t::APISIX 'no_plan'; + +log_level("info"); +repeat_each(1); +no_long_string(); +no_shuffle(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + if (!$block->error_log && !$block->no_error_log) { + $block->set_value("no_error_log", "[error]\n[alert]"); + } + + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + listen 1990; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + local fixture_loader = require("lib.fixture_loader") + local content, err = fixture_loader.load("openai/embeddings-list.json") + if not content then + ngx.status = 500 + ngx.say(err) + return + end + + ngx.status = 200 + ngx.print(content) + } + } +} +_EOC_ + } +}); + +run_tests(); + +__DATA__ + +=== TEST 1: set up route with cache_key include_vars +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/scoped", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "cache_key": { + "include_vars": ["$http_x_tenant_id"] + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 2: tenant-a first request - MISS +--- request +POST /scoped +{"messages":[{"role":"user","content":"scope test prompt"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 3: tenant-b same prompt - MISS (proves cache_key partitioning) +--- request +POST /scoped +{"messages":[{"role":"user","content":"scope test prompt"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-b +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 4: tenant-a same prompt again - HIT-L1 +--- request +POST /scoped +{"messages":[{"role":"user","content":"scope test prompt"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 + + + +=== TEST 5: set up consumers for include_consumer test +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + local consumers = { + { username = "alice", key = "alice-key" }, + { username = "bob", key = "bob-key" }, + } + + for _, c in ipairs(consumers) do + local code, body = t('/apisix/admin/consumers', + ngx.HTTP_PUT, + string.format([[{ + "username": "%s", + "plugins": { "key-auth": { "key": "%s" } } + }]], c.username, c.key) + ) + if code >= 300 then + ngx.status = code + ngx.say(body) + return + end + end + ngx.say("passed") + } + } +--- response_body +passed + + + +=== TEST 6: set up route with cache_key include_consumer + key-auth +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/2', + ngx.HTTP_PUT, + [[{ + "uri": "/per-consumer", + "plugins": { + "key-auth": {}, + 
"ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "cache_key": { + "include_consumer": true + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 7: alice first request - MISS +--- request +POST /per-consumer +{"messages":[{"role":"user","content":"per-consumer prompt"}]} +--- more_headers +Content-Type: application/json +apikey: alice-key +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 8: bob same prompt - MISS (proves include_consumer partitioning) +--- request +POST /per-consumer +{"messages":[{"role":"user","content":"per-consumer prompt"}]} +--- more_headers +Content-Type: application/json +apikey: bob-key +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 9: set up route with L2 semantic + cache_key include_vars +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/3', + ngx.HTTP_PUT, + [[{ + "uri": "/scoped-semantic", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { "ttl": 60 }, + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings", + "api_key": "test-key" + } + }, + "cache_key": { + "include_vars": ["$http_x_tenant_id"] + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 10: tenant-a first request - MISS, writes to L2 with scope=hash(tenant-a) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 11: tenant-b same prompt - MISS (FT.SEARCH scope filter excludes tenant-a's entry) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-b +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 12: tenant-a paraphrase - HIT-L2 (scope filter finds tenant-a's entry) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 From 9be2eccfe6a9fb64e182162a58d689f861264a8c Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 07:58:06 +0800 Subject: [PATCH 13/38] feat(ai-cache): register plugin in default plugin list --- apisix/cli/config.lua | 1 + 1 file changed, 1 insertion(+) diff --git 
a/apisix/cli/config.lua b/apisix/cli/config.lua index 956eef30c267..b4df580666c0 100644 --- a/apisix/cli/config.lua +++ b/apisix/cli/config.lua @@ -231,6 +231,7 @@ local _M = { "ai-prompt-template", "ai-prompt-decorator", "ai-prompt-guard", + "ai-cache", "ai-rag", "ai-rate-limiting", "ai-proxy-multi", From d691ea28a247101c060a2eceefd8999030a1c87b Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:16:32 +0800 Subject: [PATCH 14/38] fix(ai-cache): remove sort from compute_scope_hash to prevent cross-user cache collisions --- apisix/plugins/ai-cache/exact.lua | 2 -- 1 file changed, 2 deletions(-) diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index 4b4d36b157a6..a9766906508b 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -21,7 +21,6 @@ local resty_sha256 = require("resty.sha256") local to_hex = require("resty.string").to_hex local table_concat = table.concat -local table_sort = table.sort local ngx_time = ngx.time local tostring = tostring @@ -67,7 +66,6 @@ function _M.compute_scope_hash(conf, ctx) return "" end - table_sort(parts) return sha256_hex(table_concat(parts, "|")) end From 61ea49c7fb0db0760670eba7434432ebcb89fc5c Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:19:08 +0800 Subject: [PATCH 15/38] feat(ai-cache): add apisix_ai_cache_embedding_failures_total metric --- apisix/plugins/ai-cache.lua | 1 + apisix/plugins/prometheus/exporter.lua | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 7caf6cf548ee..f1061272d819 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -139,6 +139,7 @@ function _M.access(conf, ctx) local embedding, _, emb_err = emb_driver.get_embeddings(emb_conf, prompt_text, httpc, true) if not embedding then core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err) + ctx.ai_cache_embedding_failed = true else ctx.ai_cache_embedding_latency_ms = (ngx_now() - t0) * 1000 ctx.ai_cache_embedding_provider = emb_conf.provider diff --git a/apisix/plugins/prometheus/exporter.lua b/apisix/plugins/prometheus/exporter.lua index cf008400b0f0..78ce1bac0bf5 100644 --- a/apisix/plugins/prometheus/exporter.lua +++ b/apisix/plugins/prometheus/exporter.lua @@ -166,6 +166,8 @@ function _M.http_init(prometheus_enabled_in_stream) "ai_cache_misses", "expire") local ai_cache_embedding_latency_exptime = core.table.try_read_attr(attr, "metrics", "ai_cache_embedding_latency", "expire") + local ai_cache_embedding_failures_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_embedding_failures", "expire") prometheus = base_prometheus.init("prometheus-metrics", metric_prefix) @@ -289,6 +291,12 @@ function _M.http_init(prometheus_enabled_in_stream) ai_cache_embedding_latency_buckets, ai_cache_embedding_latency_exptime) + metrics.ai_cache_embedding_failures = prometheus:counter("ai_cache_embedding_failures_total", + "AI cache embedding API call failure count", + {"route_id", "service_id", "consumer", + unpack(extra_labels("ai_cache_embedding_failures"))}, + ai_cache_embedding_failures_exptime) + if prometheus_enabled_in_stream then init_stream_metrics() end @@ -428,6 +436,12 @@ function _M.http_log(conf, ctx) ctx.ai_cache_embedding_provider or "", unpack(extra_labels("ai_cache_embedding_latency", ctx)))) end + + if ctx.ai_cache_embedding_failed then + metrics.ai_cache_embedding_failures:inc(1, + gen_arr(route_id, service_id, consumer_name, + 
unpack(extra_labels("ai_cache_embedding_failures", ctx)))) + end end end From bf31bc83245648633e3382469577d6240acc4f6d Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:28:48 +0800 Subject: [PATCH 16/38] docs(ai-cache): add Ingress Controller tabs to all examples --- docs/en/latest/plugins/ai-cache.md | 432 +++++++++++++++++++++++++++++ 1 file changed, 432 insertions(+) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 0571902bcedf..2831340ebd8d 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -162,6 +162,109 @@ Synchronize the configuration to the gateway: adc sync -f adc.yaml ``` + + + + + + +Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such: + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + exact: + ttl: 3600 + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such: + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + exact: + ttl: 3600 + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + @@ -334,6 +437,123 @@ Synchronize the configuration to the gateway: adc sync -f adc.yaml ``` + + + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + - semantic + exact: + ttl: 3600 + semantic: + similarity_threshold: 0.92 + ttl: 86400 + embedding: + provider: openai + endpoint: https://api.openai.com/v1/embeddings + api_key: your-api-key + model: text-embedding-3-small + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: 
apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + - semantic + exact: + ttl: 3600 + semantic: + similarity_threshold: 0.92 + ttl: 86400 + embedding: + provider: openai + endpoint: https://api.openai.com/v1/embeddings + api_key: your-api-key + model: text-embedding-3-small + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + @@ -490,6 +710,113 @@ Synchronize the configuration to the gateway: adc sync -f adc.yaml ``` + + + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + exact: + ttl: 3600 + cache_key: + include_consumer: true + include_vars: + - "$http_x_tenant_id" + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + exact: + ttl: 3600 + cache_key: + include_consumer: true + include_vars: + - "$http_x_tenant_id" + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + @@ -569,6 +896,111 @@ Synchronize the configuration to the gateway: adc sync -f adc.yaml ``` + + + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + exact: + ttl: 3600 + bypass_on: + - header: X-Cache-Bypass + equals: "1" + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: 
ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + exact: + ttl: 3600 + bypass_on: + - header: X-Cache-Bypass + equals: "1" + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + From 803f741aba356c44e2c3856d2971ba5d2fb4583e Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:33:34 +0800 Subject: [PATCH 17/38] test(ai-cache): add multi-rule bypass_on test cases --- t/plugin/ai-cache.t | 105 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 89 insertions(+), 16 deletions(-) diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index e5e012828db9..5b3a6f654c6c 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -323,7 +323,80 @@ X-AI-Cache-Status: MISS -=== TEST 12: set up route for 4xx test +=== TEST 12: set up route with two bypass rules +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/chat", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1", + "bypass_on": [ + {"header": "X-Cache-Bypass", "equals": "1"}, + {"header": "X-Debug", "equals": "true"} + ] + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 13: first bypass rule matches - BYPASS +--- request +POST /chat +{"messages":[{"role":"user","content":"multi-rule bypass test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Cache-Bypass: 1 +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 14: second bypass rule matches - BYPASS +--- request +POST /chat +{"messages":[{"role":"user","content":"multi-rule bypass test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Debug: true +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 15: set up route for 4xx test --- config location /t { content_by_lua_block { @@ -364,7 +437,7 @@ passed -=== TEST 13: 4xx from upstream - not cached +=== TEST 16: 4xx from upstream - not cached --- request POST /error {"messages":[{"role":"user","content":"trigger an error please"}]} @@ -378,7 +451,7 @@ X-AI-Cache-Status: MISS -=== TEST 14: same prompt after 4xx - still MISS (4xx was not cached) +=== TEST 17: same prompt after 4xx - still MISS (4xx was not cached) --- request POST /error {"messages":[{"role":"user","content":"trigger an error please"}]} @@ -392,7 +465,7 @@ X-AI-Cache-Status: MISS -=== TEST 15: openai driver - parses embedding vector correctly +=== TEST 18: openai driver - parses embedding vector correctly --- http_config server { listen 1990; @@ -453,7 +526,7 @@ ok: 0.1 0.2 0.3 -=== TEST 16: openai driver - 429 from API return nil with status +=== TEST 19: openai driver - 429 from API return nil with status --- http_config server { listen 1990; @@ -492,7 +565,7 @@ status: 429 -=== TEST 17: azure_openai driver - parses embedding vector correctly +=== TEST 20: azure_openai driver - parses embedding vector correctly --- 
http_config server { listen 1990; @@ -544,7 +617,7 @@ ok: 0.4 0.5 0.6 -=== TEST 18: openai driver - 500 from API returns nil with status +=== TEST 21: openai driver - 500 from API returns nil with status --- http_config server { listen 1990; @@ -583,7 +656,7 @@ status: 500 -=== TEST 19: clean up L2 state before semantic tests +=== TEST 22: clean up L2 state before semantic tests --- config location /t { content_by_lua_block { @@ -609,7 +682,7 @@ ok -=== TEST 20: set up route for L2 semantic cache tests +=== TEST 23: set up route for L2 semantic cache tests --- config location /t { content_by_lua_block { @@ -660,7 +733,7 @@ passed -=== TEST 21: L2 - first request, cache MISS, stored in L2 +=== TEST 24: L2 - first request, cache MISS, stored in L2 --- request POST /semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -673,7 +746,7 @@ X-AI-Cache-Status: MISS -=== TEST 22: L2 - different wording hits L2 (same vector from fixture) +=== TEST 25: L2 - different wording hits L2 (same vector from fixture) --- request POST /semantic {"messages":[{"role":"user","content":"Name the capital city of France"}]} @@ -690,7 +763,7 @@ ai-cache: L2 hit -=== TEST 23: L2 - original prompt now hits L1 (backfilled by the L2 hit) +=== TEST 26: L2 - original prompt now hits L1 (backfilled by the L2 hit) --- request POST /semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -705,7 +778,7 @@ ai-cache: L1 hit for key -=== TEST 24: L2 degradation - search error results in MISS, not 500 +=== TEST 27: L2 degradation - search error results in MISS, not 500 --- config location /t { content_by_lua_block { @@ -729,7 +802,7 @@ qr/degraded gracefully|miss, no error/ -=== TEST 25: streaming MISS - upstream called, response cached via log phase +=== TEST 28: streaming MISS - upstream called, response cached via log phase --- request POST /chat {"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} @@ -742,7 +815,7 @@ X-AI-Cache-Status: MISS -=== TEST 26: streaming HIT - Content-Type is text/event-stream, SSE body returned +=== TEST 29: streaming HIT - Content-Type is text/event-stream, SSE body returned --- request POST /chat {"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} @@ -758,7 +831,7 @@ qr/data:.*content/ -=== TEST 27: non-streaming HIT after streaming MISS - returns JSON +=== TEST 30: non-streaming HIT after streaming MISS - returns JSON --- request POST /chat {"messages":[{"role":"user","content":"Stream me something cool"}]} From 0c268703f99f7f83d08d98d51029b39ccd584458 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Thu, 30 Apr 2026 08:35:24 +0800 Subject: [PATCH 18/38] docs(ai-cache): note that bypass header is unauthenticated and should be gated --- docs/en/latest/plugins/ai-cache.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 2831340ebd8d..9bd98b4f5f57 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -1078,6 +1078,8 @@ The Plugin namespaces the L2 index and entries by embedding dimension (for examp A request with the bypass header reaches the upstream but its response is not written back. Use it to force a fresh upstream call without invalidating or replacing the existing cached entry. +The bypass header is not authenticated — any client that can set the configured header and value can bypass the cache. 
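+For instance, a sketch of a bypass request, assuming the `bypass_on` rule (`X-Cache-Bypass: 1`) and the `/anything` route from the examples above, with the gateway address taken from the quickstart defaults:
+
+```shell
+curl "http://127.0.0.1:9080/anything" -X POST \
+  -H "Content-Type: application/json" \
+  -H "X-Cache-Bypass: 1" \
+  -d '{"messages":[{"role":"user","content":"What is 1+1?"}]}'
+```
+
+The response should carry `X-AI-Cache-Status: BYPASS`, and any existing cached entry for the prompt stays untouched.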
In production, gate access with an APISIX plugin such as `key-auth` (authentication) or `ip-restriction` (IP allow-listing), or strip the header at a WAF or proxy in front of APISIX.
+
 ### The semantic layer requires Redis Stack

 The `FT.CREATE` and `FT.SEARCH` commands used by the semantic layer come from the RediSearch module. Vanilla Redis will fail these commands and the layer will silently degrade to `MISS`. Use a Redis Stack image such as `redis/redis-stack:latest`.

From c345fe01c7111f5b15da156d9a87470bc450860a Mon Sep 17 00:00:00 2001
From: janiussyafiq
Date: Fri, 1 May 2026 00:07:00 +0800
Subject: [PATCH 19/38] chore(ai-cache): fix CI failures across lint, eclint,
 misc-checker, and admin tests

---
 Makefile                                      |  5 +++++
 apisix/plugins/ai-cache.lua                   | 17 ++++++++++-------
 .../ai-cache/embeddings/azure_openai.lua      |  4 +++-
 apisix/plugins/ai-cache/embeddings/openai.lua |  4 +++-
 apisix/plugins/ai-cache/exact.lua             |  4 +++-
 apisix/plugins/ai-cache/schema.lua            |  2 +-
 t/admin/plugins.t                             |  1 +
 t/plugin/ai-cache.t                           | 16 ++++++++--------
 8 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/Makefile b/Makefile
index 71ab7df1eabf..ad074ecabbdf 100644
--- a/Makefile
+++ b/Makefile
@@ -388,6 +388,11 @@ install: runtime
 	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-transport
 	$(ENV_INSTALL) apisix/plugins/ai-transport/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-transport

+	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache
+	$(ENV_INSTALL) apisix/plugins/ai-cache/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache
+	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings
+	$(ENV_INSTALL) apisix/plugins/ai-cache/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings
+
 	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings
 	$(ENV_INSTALL) apisix/plugins/ai-rag/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings
 	$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/vector-search

diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua
index f1061272d819..2c8942a9f791 100644
--- a/apisix/plugins/ai-cache.lua
+++ b/apisix/plugins/ai-cache.lua
@@ -21,9 +21,12 @@ local exact = require("apisix.plugins.ai-cache.exact")
 local semantic = require("apisix.plugins.ai-cache.semantic")
 local protocols = require("apisix.plugins.ai-protocols")
 local http = require("resty.http")
-local ngx_time = ngx.time
-local ngx_now = ngx.now
-local tostring = tostring
+local ngx = ngx
+local ngx_time = ngx.time
+local ngx_now = ngx.now
+local ipairs = ipairs
+local require = require
+local tostring = tostring
 local table_concat = table.concat

 local plugin_name = "ai-cache"
@@ -122,10 +125,10 @@ function _M.access(conf, ctx)
         else
             core.response.set_header("Content-Type", "application/json")
         end
-        return core.response.exit(200, proto.build_deny_response({
+        return 200, proto.build_deny_response({
             stream = is_stream,
             text = cached_text,
-        }))
+        })
     end
 end
@@ -169,10 +172,10 @@ function _M.access(conf, ctx)
         else
             core.response.set_header("Content-Type", "application/json")
         end
-        return core.response.exit(200, proto.build_deny_response({
+        return 200, proto.build_deny_response({
             stream = is_stream,
             text = cached_text,
-        }))
+        })
     end
 end
 end
diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua
index a52c5e265497..928a803f948e 100644
--- a/apisix/plugins/ai-cache/embeddings/azure_openai.lua
+++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua
@@ -16,8 +16,10 @@
 --
 local core = require("apisix.core")
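+-- Hoisting globals such as `type` and `ngx` into module-level locals below
+-- satisfies the lint checks named in this commit and replaces a global-table
+-- lookup with a cheaper local access on each call.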
+local type = type -local HTTP_OK = ngx.HTTP_OK +local ngx = ngx +local HTTP_OK = ngx.HTTP_OK local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR local _M = {} diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua index ff50f2bbea27..0ca8c7cd61af 100644 --- a/apisix/plugins/ai-cache/embeddings/openai.lua +++ b/apisix/plugins/ai-cache/embeddings/openai.lua @@ -16,8 +16,10 @@ -- local core = require("apisix.core") +local type = type -local HTTP_OK = ngx.HTTP_OK +local ngx = ngx +local HTTP_OK = ngx.HTTP_OK local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR local _M = {} diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index a9766906508b..d442cd89c467 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -20,9 +20,11 @@ local redis = require("apisix.utils.redis") local resty_sha256 = require("resty.sha256") local to_hex = require("resty.string").to_hex -local table_concat = table.concat +local ngx = ngx local ngx_time = ngx.time +local ipairs = ipairs local tostring = tostring +local table_concat = table.concat local KEY_PREFIX = "ai-cache:l1:" diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 444858cc9067..9416af97f638 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -124,4 +124,4 @@ _M.schema = { encrypt_fields = { "semantic.embedding.api_key", "redis_password" }, } -return _M \ No newline at end of file +return _M diff --git a/t/admin/plugins.t b/t/admin/plugins.t index adb98b28bc17..1454ec145eb0 100644 --- a/t/admin/plugins.t +++ b/t/admin/plugins.t @@ -98,6 +98,7 @@ ai-request-rewrite ai-prompt-guard ai-prompt-template ai-prompt-decorator +ai-cache ai-rag ai-aws-content-moderation ai-proxy-multi diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index 5b3a6f654c6c..1657fbec3d76 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -1,18 +1,18 @@ # -# Licensed to the Apache Software Foundation (ASF) under one or more +# Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 +# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# +# the License. You may obtain a copy of the License at +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, +# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. +# limitations under the License. 
# BEGIN { @@ -83,7 +83,7 @@ __DATA__ ngx.say("failed") else ngx.say("passed") - end + end } } --- response_body From e665d2a4830c1076e49642764c55d47e009e321b Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Fri, 1 May 2026 05:06:56 +0800 Subject: [PATCH 20/38] docs(ai-cache): document top_k and fix HTTPS endpoint wording --- docs/en/latest/plugins/ai-cache.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 9bd98b4f5f57..3c1789131134 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -47,9 +47,10 @@ The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-m | `layers` | array[string] | False | `["exact", "semantic"]` | `"exact"`, `"semantic"` | Cache layers to enable, queried in order. | | `exact.ttl` | integer | False | `3600` | ≥ 1 | Time-to-live in seconds for exact-layer entries. | | `semantic.similarity_threshold` | number | False | `0.95` | 0–1 | Minimum cosine similarity required for a semantic-layer hit. | +| `semantic.top_k` | integer | False | `1` | ≥ 1 | Number of nearest-neighbor candidates the index returns; the first candidate above `similarity_threshold` is used. | | `semantic.ttl` | integer | False | `86400` | ≥ 1 | Time-to-live in seconds for semantic-layer entries. | | `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. | -| `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | HTTPS URL of the embedding API. | +| `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | Embedding API endpoint URL. | | `semantic.embedding.api_key` | string | True (if semantic enabled) | | | API key for the embedding provider. Stored encrypted. | | `semantic.embedding.model` | string | False | | | Embedding model name. Uses provider default if omitted. | | `cache_key.include_consumer` | boolean | False | `false` | | If `true`, partition the cache by consumer name. | From 6b996cd9de2988ff83790b02176b340d59156178 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Fri, 1 May 2026 05:12:30 +0800 Subject: [PATCH 21/38] feat(ai-cache): wire semantic.top_k through L2 vector search --- apisix/plugins/ai-cache/semantic.lua | 54 +++++++++++++++------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index b44ee5675711..4d4cf1c8dedd 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ b/apisix/plugins/ai-cache/semantic.lua @@ -89,12 +89,15 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) end local binary_vec = pack_vector(embedding_vec) + local top_k = (conf.semantic and conf.semantic.top_k) or 1 + local top_k_str = tostring(top_k) local query if scope_hash == "" then - query = "*=>[KNN 1 @embedding $vec AS dist]" + query = "*=>[KNN " .. top_k_str .. " @embedding $vec AS dist]" else - query = "@scope:{" .. scope_hash .. "}=>[KNN 1 @embedding $vec AS dist]" + query = "@scope:{" .. scope_hash .. "}=>[KNN " .. top_k_str + .. 
" @embedding $vec AS dist]" end local res, search_err = red["FT.SEARCH"](red, @@ -102,7 +105,7 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) query, "PARAMS", "2", "vec", binary_vec, "SORTBY", "dist", "ASC", - "LIMIT", "0", "1", + "LIMIT", "0", top_k_str, "RETURN", "2", "response", "dist", "DIALECT", "2" ) @@ -116,31 +119,32 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) return nil, nil, nil end - -- RESP2: {count, key, {field, val, field, val, ...}, ...} - local fields = res[3] - if type(fields) ~= "table" then - return nil, nil, nil - end - - local response_text, dist - for i = 1, #fields, 2 do - if fields[i] == "response" then - response_text = fields[i + 1] - elseif fields[i] == "dist" then - dist = tonumber(fields[i + 1]) + -- RESP2: {count, key1, fields1, key2, fields2, ...} + -- Results are sorted by dist ASC. Iterate candidates and return the first + -- one whose similarity meets the threshold; skip candidates with missing + -- or corrupt fields. + for i = 3, #res, 2 do + local fields = res[i] + if type(fields) == "table" then + local response_text, dist + for j = 1, #fields, 2 do + if fields[j] == "response" then + response_text = fields[j + 1] + elseif fields[j] == "dist" then + dist = tonumber(fields[j + 1]) + end + end + + if response_text and dist then + local similarity = 1 - dist + if similarity >= threshold then + return response_text, similarity, nil + end + end end end - if not response_text or not dist then - return nil, nil, nil - end - - local similarity = 1 - dist - if similarity < threshold then - return nil, nil, nil - end - - return response_text, similarity, nil + return nil, nil, nil end From 57687ca16155a7289c4608f0392ef7f450658c69 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Fri, 1 May 2026 05:18:07 +0800 Subject: [PATCH 22/38] fix(ai-cache): guard log() against nil cache key fields on early-MISS --- apisix/plugins/ai-cache.lua | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 2c8942a9f791..cbfd37cbd846 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -215,6 +215,12 @@ function _M.log(conf, ctx) return end + -- Early-MISS paths (body parse / protocol detect / hash failure) skip + -- key computation, so bail out if cache key fields are absent. + if not ctx.ai_cache_prompt_hash or not ctx.ai_cache_prompt_text then + return + end + local upstream_status = core.response.get_upstream_status(ctx) or ngx.status if not upstream_status or upstream_status < 200 or upstream_status >= 300 then return From 1e42c8672748c04dae1468f14e26d078b4bf91f4 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Fri, 1 May 2026 06:41:47 +0800 Subject: [PATCH 23/38] docs(ai-cache): mark build_deny_response usage as a follow-up rename --- apisix/plugins/ai-cache.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index cbfd37cbd846..a095081f9f20 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -125,6 +125,9 @@ function _M.access(conf, ctx) else core.response.set_header("Content-Type", "application/json") end + -- TODO: rename build_deny_response to build_response_from_text in a + -- follow-up. We use it here to wrap cached text in the protocol's + -- response shape, not for policy denial. 
return 200, proto.build_deny_response({ stream = is_stream, text = cached_text, From 3349acc060552151e9bb278636018cf8744acb2f Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 02:10:44 +0800 Subject: [PATCH 24/38] test(prometheus): expand ai-cache metric coverage and reorganize tests --- t/plugin/prometheus-ai-cache.t | 188 +++++++++++++++++++++++++++++++-- 1 file changed, 178 insertions(+), 10 deletions(-) diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t index fc4eb02264bc..02544fe97df7 100644 --- a/t/plugin/prometheus-ai-cache.t +++ b/t/plugin/prometheus-ai-cache.t @@ -49,6 +49,7 @@ plugins: - ai-cache - prometheus - public-api + - key-auth _EOC_ $block->set_value("extra_yaml_config", $user_yaml_config); @@ -72,6 +73,13 @@ server { ngx.print(content) } } + + location /v1/embeddings-fail { + content_by_lua_block { + ngx.status = 500 + ngx.say('{"error":"simulated embedding failure"}') + } + } } _EOC_ } @@ -91,7 +99,7 @@ __DATA__ { url = "/apisix/admin/routes/1", data = [[{ - "uri": "/chat", + "uri": "/exact", "plugins": { "prometheus": {}, "ai-proxy": { @@ -148,6 +156,76 @@ __DATA__ } }]], }, + { + url = "/apisix/admin/routes/3", + data = [[{ + "uri": "/semantic-fail", + "plugins": { + "prometheus": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["semantic"], + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings-fail", + "api_key": "test-key" + } + }, + "redis_host": "127.0.0.1" + } + } + }]], + }, + { + url = "/apisix/admin/routes/4", + data = [[{ + "uri": "/exact-auth", + "plugins": { + "prometheus": {}, + "key-auth": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1" + } + } + }]], + }, + { + url = "/apisix/admin/consumers", + data = [[{ + "username": "alice", + "plugins": { + "key-auth": { + "key": "alice-key" + } + } + }]], + }, { url = "/apisix/admin/routes/metrics", data = [[{ @@ -169,13 +247,13 @@ __DATA__ } } --- response_body eval -"passed\n" x 3 +"passed\n" x 6 === TEST 2: MISS request - upstream called --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the meaning of life?"}]} --- more_headers Content-Type: application/json @@ -188,7 +266,7 @@ X-AI-Cache-Status: MISS === TEST 3: same request - HIT-L1 --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the meaning of life?"}]} --- more_headers Content-Type: application/json @@ -217,7 +295,7 @@ qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1" === TEST 6: BYPASS request - upstream called, no cache interaction --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the meaning of life?"}]} --- more_headers Content-Type: application/json @@ -237,7 +315,15 @@ qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1\n/ -=== TEST 8: cleanup Redis L2 state before semantic tests +=== TEST 8: verify BYPASS did not increment hits counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval 
+qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1\n/ + + + +=== TEST 9: cleanup Redis L2 state before semantic tests --- config location /t { content_by_lua_block { @@ -262,7 +348,7 @@ ok -=== TEST 9: L2 first request - MISS, embedding API called +=== TEST 10: L2 first request - MISS, embedding API called --- request POST /semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -276,7 +362,7 @@ X-AI-Cache-Status: MISS -=== TEST 10: L2 second request - different wording, HIT-L2 +=== TEST 11: L2 second request - different wording, HIT-L2 --- request POST /semantic {"messages":[{"role":"user","content":"Name the capital city of France"}]} @@ -290,7 +376,15 @@ X-AI-Cache-Status: HIT-L2 -=== TEST 11: verify hits counter with layer="l2" +=== TEST 12: verify miss counter for semantic route (route_id=2) +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="2",service_id="",consumer=""\} 1/ + + + +=== TEST 13: verify hits counter with layer="l2" --- request GET /apisix/prometheus/metrics --- response_body_like eval @@ -298,8 +392,82 @@ qr/apisix_ai_cache_hits_total\{route_id="2",service_id="",consumer="",layer="l2" -=== TEST 12: verify embedding latency histogram with provider label +=== TEST 14: verify embedding latency histogram with provider label --- request GET /apisix/prometheus/metrics --- response_body_like eval qr/apisix_ai_cache_embedding_latency_count\{route_id="2",service_id="",consumer="",provider="openai"\} 2/ + + + +=== TEST 15: embedding failure - request still returns 200 via fallback +--- request +POST /semantic-fail +{"messages":[{"role":"user","content":"What does this fail at?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 16: verify embedding_failures counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_embedding_failures_total\{route_id="3",service_id="",consumer=""\} 1/ + + + +=== TEST 17: verify embedding-failure request also counted as miss +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="3",service_id="",consumer=""\} 1/ + + + +=== TEST 18: authenticated MISS request - consumer alice +--- request +POST /exact-auth +{"messages":[{"role":"user","content":"Authenticated cache test"}]} +--- more_headers +Content-Type: application/json +apikey: alice-key +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 19: authenticated HIT-L1 request - consumer alice +--- request +POST /exact-auth +{"messages":[{"role":"user","content":"Authenticated cache test"}]} +--- more_headers +Content-Type: application/json +apikey: alice-key +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- wait: 1 + + + +=== TEST 20: verify consumer label is populated on hits counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="4",service_id="",consumer="alice",layer="l1"\} 1/ + + + +=== TEST 21: verify consumer label is populated on misses counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="4",service_id="",consumer="alice"\} 1/ From 46ad8a65e98c11dc2ce69ae28383f30017b68465 Mon Sep 17 00:00:00 
2001 From: janiussyafiq Date: Tue, 5 May 2026 02:54:39 +0800 Subject: [PATCH 25/38] fix(ai-cache): populate AI ctx fields on cache hit --- apisix/plugins/ai-cache.lua | 13 +++++++++++++ t/plugin/prometheus-ai-cache.t | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index a095081f9f20..69f24bec9bfb 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -48,6 +48,17 @@ local function layer_enabled(conf, name) end +local function populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text) + ctx.ai_client_protocol = protocol_name + ctx.var.request_type = is_stream and "ai_stream" or "ai_chat" + if body_tab.model then + ctx.var.request_llm_model = body_tab.model + ctx.var.llm_model = body_tab.model + end + ctx.var.llm_response_text = cached_text +end + + function _M.check_schema(conf) local ok, err = core.schema.check(schema.schema, conf) if not ok then @@ -125,6 +136,7 @@ function _M.access(conf, ctx) else core.response.set_header("Content-Type", "application/json") end + populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text) -- TODO: rename build_deny_response to build_response_from_text in a -- follow-up. We use it here to wrap cached text in the protocol's -- response shape, not for policy denial. @@ -175,6 +187,7 @@ function _M.access(conf, ctx) else core.response.set_header("Content-Type", "application/json") end + populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text) return 200, proto.build_deny_response({ stream = is_stream, text = cached_text, diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t index 02544fe97df7..0863402ea521 100644 --- a/t/plugin/prometheus-ai-cache.t +++ b/t/plugin/prometheus-ai-cache.t @@ -471,3 +471,11 @@ qr/apisix_ai_cache_hits_total\{route_id="4",service_id="",consumer="alice",layer GET /apisix/prometheus/metrics --- response_body_like eval qr/apisix_ai_cache_misses_total\{route_id="4",service_id="",consumer="alice"\} 1/ + + + +=== TEST 22: verify cache hit is labelled as ai_chat (not traditional_http) +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_http_status\{code="200",route="1"[^}]*request_type="ai_chat"[^}]*response_source="apisix"[^}]*\} 1/ From 84c6b56b07456b6a4ddc7fe05e53159b91eee054 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 02:57:28 +0800 Subject: [PATCH 26/38] fix(ai-cache): reject empty layers array via schema --- apisix/plugins/ai-cache/schema.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 9416af97f638..9bb417183424 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -94,6 +94,7 @@ _M.schema = { type = "array", items = { type = "string", enum = { "exact", "semantic" } }, uniqueItems = true, + minItems = 1, default = { "exact", "semantic" }, }, cache_key = { From 8f9fbfd6adb5366a2a715f0133c145e7303bd9fc Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 03:03:49 +0800 Subject: [PATCH 27/38] fix(ai-cache): handle missing/dropped RediSearch index gracefully --- apisix/plugins/ai-cache/semantic.lua | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index 4d4cf1c8dedd..0e7519993cd1 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ 
b/apisix/plugins/ai-cache/semantic.lua @@ -48,8 +48,13 @@ local function pack_vector(vec) end local index_ready = {} +local index_unsupported = false local function ensure_index(red, dim) + if index_unsupported then + return nil, "RediSearch not supported on this Redis instance" + end + if index_ready[dim] then return true end @@ -67,8 +72,16 @@ local function ensure_index(red, dim) "created_at", "NUMERIC" ) - if err and not err:find("already exists") then - return nil, "FT.CREATE failed: " .. err + if err then + -- RediSearch module absent — latch and stop retrying on every request + if err:find("unknown command", 1, true) + or err:find("ERR unknown", 1, true) then + index_unsupported = true + return nil, "RediSearch not supported on this Redis instance: " .. err + end + if not err:find("already exists") then + return nil, "FT.CREATE failed: " .. err + end end index_ready[dim] = true @@ -112,6 +125,10 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) if search_err then + -- index was dropped externally — invalidate so next call recreates + if search_err:find("Unknown Index name", 1, true) then + index_ready[#embedding_vec] = nil + end return nil, nil, search_err end From d2eca3d2d2d5764a89b35a05c8146c8a6f9ecc85 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 03:22:44 +0800 Subject: [PATCH 28/38] test(ai-cache): cover symmetric HIT for include_consumer and L2 scope --- t/plugin/ai-cache-scope.t | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/t/plugin/ai-cache-scope.t b/t/plugin/ai-cache-scope.t index 97614fe173f8..21facdcf4688 100644 --- a/t/plugin/ai-cache-scope.t +++ b/t/plugin/ai-cache-scope.t @@ -257,10 +257,24 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 9: bob same prompt again - HIT-L1 (proves bob has own cache) +--- request +POST /per-consumer +{"messages":[{"role":"user","content":"per-consumer prompt"}]} +--- more_headers +Content-Type: application/json +apikey: bob-key +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 -=== TEST 9: set up route with L2 semantic + cache_key include_vars +=== TEST 10: set up route with L2 semantic + cache_key include_vars --- config location /t { content_by_lua_block { @@ -313,7 +327,7 @@ passed -=== TEST 10: tenant-a first request - MISS, writes to L2 with scope=hash(tenant-a) +=== TEST 11: tenant-a first request - MISS, writes to L2 with scope=hash(tenant-a) --- request POST /scoped-semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -328,7 +342,7 @@ X-AI-Cache-Status: MISS -=== TEST 11: tenant-b same prompt - MISS (FT.SEARCH scope filter excludes tenant-a's entry) +=== TEST 12: tenant-b same prompt - MISS (FT.SEARCH scope filter excludes tenant-a's entry) --- request POST /scoped-semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -339,10 +353,11 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS +--- wait: 1 -=== TEST 12: tenant-a paraphrase - HIT-L2 (scope filter finds tenant-a's entry) +=== TEST 13: tenant-a paraphrase - HIT-L2 (scope filter finds tenant-a's entry) --- request POST /scoped-semantic {"messages":[{"role":"user","content":"Name the capital city of France"}]} @@ -353,3 +368,17 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 
--- response_headers X-AI-Cache-Status: HIT-L2 + + + +=== TEST 14: tenant-b paraphrase - HIT-L2 (proves tenant-b has own L2 entry) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-b +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 From f6923223aa6e1ea9e6e20933b142c2e925a55ad8 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 05:56:06 +0800 Subject: [PATCH 29/38] test(ai-cache): expand, tighten, and reorganize cache plugin tests --- t/plugin/ai-cache.t | 445 ++++++++++++++++----------------- t/plugin/prometheus-ai-cache.t | 4 +- 2 files changed, 211 insertions(+), 238 deletions(-) diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t index 1657fbec3d76..b8e1557c2cca 100644 --- a/t/plugin/ai-cache.t +++ b/t/plugin/ai-cache.t @@ -133,6 +133,7 @@ passed layers = { "semantic" }, redis_host = "127.0.0.1", }) + if not ok then ngx.say("failed: ", err) else @@ -153,15 +154,16 @@ failed: semantic layer requires semantic.embedding to be configured local ok, err = plugin.check_schema({ layers = { "invalid_layer" }, }) + if not ok then - ngx.say("failed") + ngx.say(err) else ngx.say("passed") end } } ---- response_body -failed +--- response_body eval +qr/.*property "layers" validation failed:.*matches none of the enum values.*/ @@ -182,14 +184,14 @@ failed }) if not ok then - ngx.say("failed") + ngx.say(err) else ngx.say("passed") end } } ---- response_body -failed +--- response_body eval +qr/.*property "provider" validation failed: matches none of the enum values.*/ @@ -211,18 +213,40 @@ failed }) if not ok then - ngx.say("failed") + ngx.say(err) else ngx.say("passed") end } } ---- response_body -failed +--- response_body eval +qr/.*property "similarity_threshold" validation failed: expected 1\.5 to be at most.*/ -=== TEST 7: set up route for L1 cache tests +=== TEST 7: layers empty array - should fail (minItems=1) +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = {}, + redis_host = "127.0.0.1", + }) + + if not ok then + ngx.say(err) + else + ngx.say("passed") + end + } + } +--- response_body eval +qr/.*property "layers" validation failed: expect array to have at least 1 items.*/ + + + +=== TEST 8: set up route for L1 cache tests --- config location /t { content_by_lua_block { @@ -230,7 +254,7 @@ failed local code, body = t('/apisix/admin/routes/1', ngx.HTTP_PUT, [[{ - "uri": "/chat", + "uri": "/exact", "plugins": { "ai-proxy": { "provider": "openai", @@ -264,9 +288,9 @@ passed -=== TEST 8: first request - cache MISS, upstream called +=== TEST 9: first request - cache MISS, upstream called --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the answer to life?"}]} --- more_headers Content-Type: application/json @@ -275,13 +299,13 @@ X-AI-Fixture: openai/chat-basic.json --- response_headers X-AI-Cache-Status: MISS --- response_body_like eval -qr/content/ +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ -=== TEST 9: second identical request - cache HIT-L1, no upstream call +=== TEST 10: second identical request - cache HIT-L1, no upstream call --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the answer to life?"}]} --- more_headers Content-Type: application/json @@ -289,16 +313,18 @@ X-AI-Fixture: 
openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: HIT-L1 +--- response_headers_like +X-AI-Cache-Age: \d+ --- response_body_like eval -qr/content/ +qr/"content":\s?"1 \+ 1 = 2\."/ --- error_log ai-cache: L1 hit for key -=== TEST 10: bypass header - BYPASS, upstream called, not cached +=== TEST 11: bypass header - BYPASS, upstream called, not cached --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the bypass question?"}]} --- more_headers Content-Type: application/json @@ -310,9 +336,9 @@ X-AI-Cache-Status: BYPASS -=== TEST 11: same prompt without bypass after bypass - still MISS (bypass did not cache) +=== TEST 12: same prompt without bypass after bypass - still MISS (bypass did not cache) --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"What is the bypass question?"}]} --- more_headers Content-Type: application/json @@ -323,7 +349,7 @@ X-AI-Cache-Status: MISS -=== TEST 12: set up route with two bypass rules +=== TEST 13: set up route with two bypass rules --- config location /t { content_by_lua_block { @@ -331,7 +357,7 @@ X-AI-Cache-Status: MISS local code, body = t('/apisix/admin/routes/1', ngx.HTTP_PUT, [[{ - "uri": "/chat", + "uri": "/exact", "plugins": { "ai-proxy": { "provider": "openai", @@ -368,9 +394,9 @@ passed -=== TEST 13: first bypass rule matches - BYPASS +=== TEST 14: first bypass rule matches - BYPASS --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"multi-rule bypass test"}]} --- more_headers Content-Type: application/json @@ -382,9 +408,9 @@ X-AI-Cache-Status: BYPASS -=== TEST 14: second bypass rule matches - BYPASS +=== TEST 15: second bypass rule matches - BYPASS --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"multi-rule bypass test"}]} --- more_headers Content-Type: application/json @@ -396,7 +422,7 @@ X-AI-Cache-Status: BYPASS -=== TEST 15: set up route for 4xx test +=== TEST 16: set up route for upstream-status filter tests --- config location /t { content_by_lua_block { @@ -437,226 +463,190 @@ passed -=== TEST 16: 4xx from upstream - not cached +=== TEST 17: non-2xx upstream response - not cached (status code filter) --- request POST /error -{"messages":[{"role":"user","content":"trigger an error please"}]} +{"messages":[{"role":"user","content":"trigger a server error"}]} --- more_headers Content-Type: application/json X-AI-Fixture: openai/chat-basic.json -X-AI-Fixture-Status: 400 ---- error_code: 400 +X-AI-Fixture-Status: 500 +--- error_code: 500 --- response_headers X-AI-Cache-Status: MISS -=== TEST 17: same prompt after 4xx - still MISS (4xx was not cached) +=== TEST 18: same prompt after non-2xx - still MISS (was not cached) --- request POST /error -{"messages":[{"role":"user","content":"trigger an error please"}]} +{"messages":[{"role":"user","content":"trigger a server error"}]} --- more_headers Content-Type: application/json X-AI-Fixture: openai/chat-basic.json -X-AI-Fixture-Status: 400 ---- error_code: 400 +X-AI-Fixture-Status: 500 +--- error_code: 500 --- response_headers X-AI-Cache-Status: MISS -=== TEST 18: openai driver - parses embedding vector correctly ---- http_config -server { - listen 1990; - default_type 'application/json'; - - location /v1/embeddings { - content_by_lua_block { - local cjson = require("cjson.safe") - ngx.req.read_body() - local body = cjson.decode(ngx.req.get_body_data()) - - if ngx.req.get_headers()["Authorization"] ~= "Bearer test-key" then - ngx.status = 401 - 
ngx.say('{"error":"unauthorized"}') - return - end - - ngx.status = 200 - ngx.say(cjson.encode({ - data = { - { embedding = {0.1, 0.2, 0.3}, index = 0, object = "embedding" } - }, - model = body.model, - object = "list" - })) - } - } -} +=== TEST 19: set up route with very small max_cache_body_size --- config location /t { content_by_lua_block { - local http = require("resty.http") - local driver = require("apisix.plugins.ai-cache.embeddings.openai") - - local httpc = http.new() - local conf = { - endpoint = "http://127.0.0.1:1990/v1/embeddings", - api_key = "test-key", - model = "text-embedding-3-small", - } - - local embedding, status, err = driver.get_embeddings(conf, "hello world", httpc, false) - if not embedding then - ngx.say("error: ", err) - return - end + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/3', + ngx.HTTP_PUT, + [[{ + "uri": "/tiny", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "max_cache_body_size": 5, + "redis_host": "127.0.0.1" + } + } + }]] + ) - if #embedding ~= 3 then - ngx.say("wrong length: ", #embedding) - return + if code >= 300 then + ngx.status = code end - - ngx.say("ok: ", embedding[1], " ", embedding[2], " ", embedding[3]) + ngx.say(body) } } --- response_body -ok: 0.1 0.2 0.3 +passed -=== TEST 19: openai driver - 429 from API return nil with status ---- http_config -server { - listen 1990; - default_type 'application/json'; - - location /v1/embeddings { - content_by_lua_block { - ngx.status = 429 - ngx.say('{"error":{"message":"rate limit exceeded","type":"requests"}}') - } - } -} ---- config - location /t { - content_by_lua_block { - local http = require("resty.http") - local driver = require("apisix.plugins.ai-cache.embeddings.openai") - - local httpc = http.new() - local conf = { - endpoint = "http://127.0.0.1:1990/v1/embeddings", - api_key = "test-key", - } - - local embedding, status, err = driver.get_embeddings(conf, "hello", httpc, false) - if embedding then - ngx.say("unexpected success") - return - end - - ngx.say("status: ", status) - } - } ---- response_body -status: 429 +=== TEST 20: oversize response - MISS, log warns and skips cache write +--- request +POST /tiny +{"messages":[{"role":"user","content":"oversize body test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ +--- error_log +exceeds max_cache_body_size -=== TEST 20: azure_openai driver - parses embedding vector correctly ---- http_config -server { - listen 1990; - default_type 'application/json'; +=== TEST 21: same prompt after oversize - still MISS (was not cached) +--- request +POST /tiny +{"messages":[{"role":"user","content":"oversize body test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ +--- error_log +exceeds max_cache_body_size - location /embeddings { - content_by_lua_block { - local cjson = require("cjson.safe") - if ngx.req.get_headers()["api-key"] ~= "azure-test-key" then - ngx.status = 401 - 
ngx.say('{"error":"unauthorized"}') - return - end - ngx.status = 200 - ngx.say(cjson.encode({ - data = { - { embedding = {0.4, 0.5, 0.6}, index = 0, object = "embedding" } - }, - object = "list" - })) - } - } -} +=== TEST 22: set up route with custom cache header names --- config location /t { content_by_lua_block { - local http = require("resty.http") - local driver = require("apisix.plugins.ai-cache.embeddings.azure_openai") - - local httpc = http.new() - local conf = { - endpoint = "http://127.0.0.1:1990/embeddings", - api_key = "azure-test-key", - } - - local embedding, status, err = driver.get_embeddings(conf, "hello world", httpc, false) - if not embedding then - ngx.say("error: ", err) - return - end + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/4', + ngx.HTTP_PUT, + [[{ + "uri": "/custom-headers", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "headers": { + "cache_status": "X-Custom-Status", + "cache_age": "X-Custom-Age" + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) - ngx.say("ok: ", embedding[1], " ", embedding[2], " ", embedding[3]) + if code >= 300 then + ngx.status = code + end + ngx.say(body) } } --- response_body -ok: 0.4 0.5 0.6 +passed -=== TEST 21: openai driver - 500 from API returns nil with status ---- http_config -server { - listen 1990; - default_type 'application/json'; +=== TEST 23: MISS populates the cache and emits custom status header +--- request +POST /custom-headers +{"messages":[{"role":"user","content":"custom header test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-Custom-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ +--- wait: 1 - location /v1/embeddings { - content_by_lua_block { - ngx.status = 500 - ngx.say('{"error":{"message":"internal server error"}}') - } - } -} ---- config - location /t { - content_by_lua_block { - local http = require("resty.http") - local driver = require("apisix.plugins.ai-cache.embeddings.openai") - - local httpc = http.new() - local conf = { - endpoint = "http://127.0.0.1:1990/v1/embeddings", - api_key = "test-key", - } - - local embedding, status, err = driver.get_embeddings(conf, "hello", httpc, false) - if embedding then - ngx.say("unexpected success") - return - end - ngx.say("status: ", status) - } - } ---- response_body -status: 500 +=== TEST 24: HIT emits custom status and age headers (defaults not used) +--- request +POST /custom-headers +{"messages":[{"role":"user","content":"custom header test"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-Custom-Status: HIT-L1 +X-AI-Cache-Status: +X-AI-Cache-Age: +--- response_headers_like +X-Custom-Age: \d+ +--- response_body_like eval +qr/"content":\s?"1 \+ 1 = 2\."/ -=== TEST 22: clean up L2 state before semantic tests + +=== TEST 25: clean up Redis cache state before semantic tests --- config location /t { content_by_lua_block { @@ -665,7 +655,6 @@ status: 500 red:set_timeout(1000) assert(red:connect("127.0.0.1", 6379)) - red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD") local keys = red:keys("ai-cache:*") @@ -682,12 +671,12 @@ ok -=== TEST 23: set up route for L2 
semantic cache tests +=== TEST 26: set up route for L2 semantic cache tests --- config location /t { content_by_lua_block { local t = require("lib.test_admin").test - local code, body = t('/apisix/admin/routes/3', + local code, body = t('/apisix/admin/routes/5', ngx.HTTP_PUT, [[{ "uri": "/semantic", @@ -733,7 +722,7 @@ passed -=== TEST 24: L2 - first request, cache MISS, stored in L2 +=== TEST 27: L2 - first request, cache MISS, stored in L2 --- request POST /semantic {"messages":[{"role":"user","content":"What is the capital of France??"}]} @@ -743,10 +732,12 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ -=== TEST 25: L2 - different wording hits L2 (same vector from fixture) +=== TEST 28: L2 - different wording hits L2 (same vector from fixture) --- request POST /semantic {"messages":[{"role":"user","content":"Name the capital city of France"}]} @@ -756,55 +747,35 @@ X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: HIT-L2 +--- response_headers_like +X-AI-Cache-Similarity: \d+(\.\d+)? --- response_body_like eval -qr/content/ +qr/"content":\s?"1 \+ 1 = 2\."/ --- error_log ai-cache: L2 hit -=== TEST 26: L2 - original prompt now hits L1 (backfilled by the L2 hit) +=== TEST 29: L2 - paraphrase now hits L1 (backfilled by the previous L2 hit) --- request POST /semantic -{"messages":[{"role":"user","content":"What is the capital of France??"}]} +{"messages":[{"role":"user","content":"Name the capital city of France"}]} --- more_headers Content-Type: application/json X-AI-Fixture: openai/chat-basic.json --- error_code: 200 --- response_headers X-AI-Cache-Status: HIT-L1 +--- response_body_like eval +qr/"content":\s?"1 \+ 1 = 2\."/ --- error_log ai-cache: L1 hit for key -=== TEST 27: L2 degradation - search error results in MISS, not 500 ---- config - location /t { - content_by_lua_block { - local semantic = require("apisix.plugins.ai-cache.semantic") - local conf = { - redis_host = "127.0.0.1", - redis_port = 6379, - redis_timeout = 100, - } - - local text, sim, err = semantic.search(conf, "", {0.1, 0.2, 0.3}, 0.95) - if err then - ngx.say("degraded gracefully") - else - ngx.say("miss, no error") - end - } - } ---- response_body_like eval -qr/degraded gracefully|miss, no error/ - - - -=== TEST 28: streaming MISS - upstream called, response cached via log phase +=== TEST 30: streaming MISS - upstream called, response cached via log phase --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} --- more_headers Content-Type: application/json @@ -812,12 +783,14 @@ X-AI-Fixture: openai/chat-streaming.sse --- error_code: 200 --- response_headers X-AI-Cache-Status: MISS +--- response_body_like eval +qr/data:.*"content":"Hello"/ -=== TEST 29: streaming HIT - Content-Type is text/event-stream, SSE body returned +=== TEST 31: streaming HIT - Content-Type is text/event-stream, SSE body returned --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} --- more_headers Content-Type: application/json @@ -826,14 +799,14 @@ Content-Type: application/json X-AI-Cache-Status: HIT-L1 Content-Type: text/event-stream --- response_body_like eval -qr/data:.*content/ +qr/data:.*"content":\s?"Hello!"/ --- wait: 1 -=== TEST 30: non-streaming HIT after streaming MISS - returns JSON +=== TEST 32: non-streaming HIT 
after streaming MISS - returns JSON --- request -POST /chat +POST /exact {"messages":[{"role":"user","content":"Stream me something cool"}]} --- more_headers Content-Type: application/json @@ -842,4 +815,4 @@ Content-Type: application/json X-AI-Cache-Status: HIT-L1 Content-Type: application/json --- response_body_like eval -qr/content/ +qr/"content":\s?"Hello!"/ diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t index 0863402ea521..3af1a6ae2491 100644 --- a/t/plugin/prometheus-ai-cache.t +++ b/t/plugin/prometheus-ai-cache.t @@ -323,7 +323,7 @@ qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1" -=== TEST 9: cleanup Redis L2 state before semantic tests +=== TEST 9: cleanup Redis cache state before semantic tests --- config location /t { content_by_lua_block { @@ -332,7 +332,7 @@ qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1" red:set_timeout(1000) assert(red:connect("127.0.0.1", 6379)) - red["FT.DROPINDEX"](red, "ai-cache-idx", "DD") + red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD") local keys = red:keys("ai-cache:*") if type(keys) == "table" and #keys > 0 then From 439b9f03ce988dfa23a6d10a37d845bc87855d7c Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 06:43:35 +0800 Subject: [PATCH 30/38] fix(ai-cache): expose embedding ssl_verify, document model + timeout --- apisix/plugins/ai-cache.lua | 6 ++++-- apisix/plugins/ai-cache/embeddings/azure_openai.lua | 6 ++++++ apisix/plugins/ai-cache/embeddings/openai.lua | 6 ++++++ apisix/plugins/ai-cache/schema.lua | 8 ++++++++ docs/en/latest/plugins/ai-cache.md | 4 +++- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 69f24bec9bfb..cf914ab5d59b 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -154,7 +154,9 @@ function _M.access(conf, ctx) local httpc = http.new() local t0 = ngx_now() - local embedding, _, emb_err = emb_driver.get_embeddings(emb_conf, prompt_text, httpc, true) + local embedding, _, emb_err = emb_driver.get_embeddings( + emb_conf, prompt_text, httpc, emb_conf.ssl_verify + ) if not embedding then core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err) ctx.ai_cache_embedding_failed = true @@ -285,7 +287,7 @@ function _M.log(conf, ctx) ) local httpc = http.new() local emb, _, emb_err = emb_driver.get_embeddings( - emb_conf, prompt_text, httpc, true + emb_conf, prompt_text, httpc, emb_conf.ssl_verify ) if not emb then ngx.log(ngx.WARN, diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua index 928a803f948e..6f862ea78cc8 100644 --- a/apisix/plugins/ai-cache/embeddings/azure_openai.lua +++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua @@ -31,6 +31,8 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) return nil, HTTP_INTERNAL_SERVER_ERROR, err end + httpc:set_timeout(conf.timeout) + local res, err = httpc:request_uri(conf.endpoint, { method = "POST", headers = { @@ -39,6 +41,7 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) }, body = body, ssl_verify = ssl_verify, + keepalive = true, }) if not res or not res.body then @@ -62,6 +65,9 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) if type(embedding) ~= "table" then return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response" end + if #embedding == 0 then + return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty" + end return 
embedding, nil, nil end diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua index 0ca8c7cd61af..740b12d23f2d 100644 --- a/apisix/plugins/ai-cache/embeddings/openai.lua +++ b/apisix/plugins/ai-cache/embeddings/openai.lua @@ -34,6 +34,8 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) return nil, HTTP_INTERNAL_SERVER_ERROR, err end + httpc:set_timeout(conf.timeout) + local res, err = httpc:request_uri(conf.endpoint, { method = "POST", headers = { @@ -42,6 +44,7 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) }, body = body, ssl_verify = ssl_verify, + keepalive = true, }) if not res or not res.body then @@ -65,6 +68,9 @@ function _M.get_embeddings(conf, text, httpc, ssl_verify) if type(embedding) ~= "table" then return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response" end + if #embedding == 0 then + return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty" + end return embedding, nil, nil end diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 9bb417183424..57e4b9892045 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -29,6 +29,14 @@ local embedding_schema = { model = { type = "string" }, endpoint = { type = "string" }, api_key = { type = "string" }, + timeout = { + type = "integer", + minimum = 1, + maximum = 600000, + default = 5000, + description = "timeout in milliseconds", + }, + ssl_verify = { type = "boolean", default = true }, }, required = { "provider", "endpoint", "api_key" }, } diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index 3c1789131134..f41559260c70 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -52,7 +52,9 @@ The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-m | `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. | | `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | Embedding API endpoint URL. | | `semantic.embedding.api_key` | string | True (if semantic enabled) | | | API key for the embedding provider. Stored encrypted. | -| `semantic.embedding.model` | string | False | | | Embedding model name. Uses provider default if omitted. | +| `semantic.embedding.model` | string | False | | | Embedding model name. Sent in the request body for `provider: openai`; ignored for `provider: azure_openai` (Azure infers the model from the deployment URL). Uses provider default if omitted. | +| `semantic.embedding.timeout` | integer | False | `5000` | [1, 600000] | HTTP request timeout in milliseconds for embedding API calls. | +| `semantic.embedding.ssl_verify` | boolean | False | `true` | | Whether to verify the embedding endpoint's TLS certificate. | | `cache_key.include_consumer` | boolean | False | `false` | | If `true`, partition the cache by consumer name. | | `cache_key.include_vars` | array[string] | False | `[]` | | Additional `ctx.var` names included in the cache key, for example `["$http_x_tenant_id"]`. | | `bypass_on` | array[object] | False | | | List of `{header, equals}` rules. If any matches, the request bypasses the cache. 
| From 61530f85968c1c013f1b2849534aeac1bceac591 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 06:51:51 +0800 Subject: [PATCH 31/38] docs(ai-cache): add field descriptions to schema; cap top_k at 100 --- apisix/plugins/ai-cache/schema.lua | 75 +++++++++++++++++++++++++----- docs/en/latest/plugins/ai-cache.md | 2 +- 2 files changed, 65 insertions(+), 12 deletions(-) diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua index 57e4b9892045..02587f7fb14c 100644 --- a/apisix/plugins/ai-cache/schema.lua +++ b/apisix/plugins/ai-cache/schema.lua @@ -25,18 +25,34 @@ local embedding_schema = { provider = { type = "string", enum = { "openai", "azure_openai" }, + description = "Embedding API provider.", + }, + model = { + type = "string", + description = "Embedding model name. Sent in the request body for " + .. "provider: openai; ignored for provider: azure_openai " + .. "(Azure infers the model from the deployment URL).", + }, + endpoint = { + type = "string", + description = "Embedding API endpoint URL.", + }, + api_key = { + type = "string", + description = "API key for the embedding provider.", }, - model = { type = "string" }, - endpoint = { type = "string" }, - api_key = { type = "string" }, timeout = { type = "integer", minimum = 1, maximum = 600000, default = 5000, - description = "timeout in milliseconds", + description = "HTTP request timeout in milliseconds for embedding API calls.", + }, + ssl_verify = { + type = "boolean", + default = true, + description = "Whether to verify the embedding endpoint's TLS certificate.", }, - ssl_verify = { type = "boolean", default = true }, }, required = { "provider", "endpoint", "api_key" }, } @@ -49,16 +65,21 @@ local semantic_schema = { minimum = 0, maximum = 1, default = 0.95, + description = "Minimum cosine similarity required for a semantic-layer hit.", }, top_k = { type = "integer", minimum = 1, + maximum = 100, default = 1, + description = "Number of nearest-neighbor candidates the index returns; " + .. "the first candidate above similarity_threshold is used.", }, ttl = { type = "integer", minimum = 1, default = 86400, + description = "Time-to-live in seconds for semantic-layer entries.", }, embedding = embedding_schema, }, @@ -72,6 +93,7 @@ local exact_schema = { type = "integer", minimum = 1, default = 3600, + description = "Time-to-live in seconds for exact-layer entries.", }, }, } @@ -80,8 +102,15 @@ local exact_schema = { local bypass_item_schema = { type = "object", properties = { - header = { type = "string" }, - equals = { type = "string" }, + header = { + type = "string", + description = "Request header name to inspect.", + }, + equals = { + type = "string", + description = "Value to match against the header. " + .. "If equal, the request bypasses the cache.", + }, }, required = { "header", "equals" }, } @@ -89,9 +118,22 @@ local bypass_item_schema = { local headers_schema = { type = "object", properties = { - cache_status = { type = "string", default = "X-AI-Cache-Status" }, - cache_similarity = { type = "string", default = "X-AI-Cache-Similarity" }, - cache_age = { type = "string", default = "X-AI-Cache-Age" }, + cache_status = { + type = "string", + default = "X-AI-Cache-Status", + description = "Response header name for cache status " + .. 
"(HIT-L1 / HIT-L2 / MISS / BYPASS).", + }, + cache_similarity = { + type = "string", + default = "X-AI-Cache-Similarity", + description = "Response header name for the similarity score of a semantic-layer hit.", + }, + cache_age = { + type = "string", + default = "X-AI-Cache-Age", + description = "Response header name for the age in seconds of an exact-layer hit.", + }, }, } @@ -104,15 +146,22 @@ _M.schema = { uniqueItems = true, minItems = 1, default = { "exact", "semantic" }, + description = "Cache layers to enable, queried in order.", }, cache_key = { type = "object", properties = { - include_consumer = {type = "boolean", default = false }, + include_consumer = { + type = "boolean", + default = false, + description = "If true, partition the cache by consumer name.", + }, include_vars = { type = "array", items = { type = "string" }, default = {}, + description = "Additional ctx.var names included in the cache key, " + .. "for example [\"$http_x_tenant_id\"].", }, }, }, @@ -121,12 +170,16 @@ _M.schema = { bypass_on = { type = "array", items = bypass_item_schema, + description = "List of {header, equals} rules. " + .. "If any matches, the request bypasses the cache.", }, headers = headers_schema, max_cache_body_size = { type = "integer", minimum = 1, default = 1048576, + description = "Maximum response size in bytes to write to cache. " + .. "Larger responses pass through but are not cached.", }, }, allOf = { redis_schema.schema.redis }, diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index f41559260c70..cec70ae3b675 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -47,7 +47,7 @@ The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-m | `layers` | array[string] | False | `["exact", "semantic"]` | `"exact"`, `"semantic"` | Cache layers to enable, queried in order. | | `exact.ttl` | integer | False | `3600` | ≥ 1 | Time-to-live in seconds for exact-layer entries. | | `semantic.similarity_threshold` | number | False | `0.95` | 0–1 | Minimum cosine similarity required for a semantic-layer hit. | -| `semantic.top_k` | integer | False | `1` | ≥ 1 | Number of nearest-neighbor candidates the index returns; the first candidate above `similarity_threshold` is used. | +| `semantic.top_k` | integer | False | `1` | [1, 100] | Number of nearest-neighbor candidates the index returns; the first candidate above `similarity_threshold` is used. | | `semantic.ttl` | integer | False | `86400` | ≥ 1 | Time-to-live in seconds for semantic-layer entries. | | `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. | | `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | Embedding API endpoint URL. 
| From a1de7514eba6f7fec7f524b0f85b4cf76784c320 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 07:05:00 +0800 Subject: [PATCH 32/38] refactor(ai-cache): tighten exact.lua redis pooling and prune dead code --- apisix/plugins/ai-cache.lua | 7 +------ apisix/plugins/ai-cache/exact.lua | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index cf914ab5d59b..026a2ee45b1f 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -113,12 +113,7 @@ function _M.access(conf, ctx) local prompt_text = table_concat(contents, " ") local scope_hash = exact.compute_scope_hash(conf, ctx) - local prompt_hash, hash_err = exact.compute_prompt_hash(prompt_text) - if not prompt_hash then - core.log.warn("ai-cache: failed to compute prompt hash: ", hash_err) - ctx.ai_cache_status = "MISS" - return - end + local prompt_hash = exact.compute_prompt_hash(prompt_text) local is_stream = body_tab.stream == true diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index d442cd89c467..52f3f98f6f56 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -37,7 +37,6 @@ local function sha256_hex(s) return to_hex(hash:final()) end -_M.sha256_hex = sha256_hex function _M.compute_scope_hash(conf, ctx) local cache_key = conf.cache_key @@ -73,7 +72,7 @@ end function _M.compute_prompt_hash(text) - return sha256_hex(text), nil + return sha256_hex(text) end @@ -84,13 +83,14 @@ function _M.get(conf, scope_hash, prompt_hash) end local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash - local res, err = red:get(key) - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) - - if err then - return nil, nil, err + local res, get_err = red:get(key) + if get_err then + red:close() + return nil, nil, get_err end + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + if res == ngx.null then return nil, nil, nil end @@ -117,16 +117,18 @@ function _M.set(conf, scope_hash, prompt_hash, text, ttl) }) if not entry then - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + red:close() return encode_err end - local ok, err = red:set(key, entry, "EX", ttl) - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) - + local ok, set_err = red:set(key, entry, "EX", ttl) if not ok then - return err + red:close() + return set_err end + + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return nil end From 7df710ead32152131bfb3206f29d68d7580b09d5 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 07:17:38 +0800 Subject: [PATCH 33/38] fix(ai-cache): tighten semantic.lua redis pooling and atomicize writes --- apisix/plugins/ai-cache/semantic.lua | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua index 0e7519993cd1..6d84dbd28675 100644 --- a/apisix/plugins/ai-cache/semantic.lua +++ b/apisix/plugins/ai-cache/semantic.lua @@ -97,7 +97,7 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) local ok, init_err = ensure_index(red, #embedding_vec) if not ok then - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + red:close() return nil, nil, init_err end @@ -122,9 +122,9 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) "RETURN", "2", "response", "dist", 
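+        -- "dist" is the KNN distance reported by RediSearch; assuming the
+        -- index uses the COSINE metric (consistent with the 0-1
+        -- similarity_threshold), dist equals 1 - cosine_similarity, so a
+        -- threshold t corresponds to a distance cutoff of 1 - t.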
"DIALECT", "2" ) - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) if search_err then + red:close() -- index was dropped externally — invalidate so next call recreates if search_err:find("Unknown Index name", 1, true) then index_ready[#embedding_vec] = nil @@ -132,6 +132,8 @@ function _M.search(conf, scope_hash, embedding_vec, threshold) return nil, nil, search_err end + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + if not res or res[1] == 0 then return nil, nil, nil end @@ -173,26 +175,35 @@ function _M.store(conf, scope_hash, embedding_vec, text, ttl) local ok, init_err = ensure_index(red, #embedding_vec) if not ok then - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + red:close() return init_err end local binary_vec = pack_vector(embedding_vec) local key = key_prefix(#embedding_vec) .. uuid.generate_v4() - local set_ok, set_err = red:hset(key, + -- HSET + EXPIRE wrapped in MULTI/EXEC so the entry is never written + -- without its TTL (which would orphan it in Redis forever). + local _, multi_err = red:multi() + if multi_err then + red:close() + return multi_err + end + + red:hset(key, "embedding", binary_vec, "response", text, "scope", scope_hash, "created_at", tostring(ngx_time()) ) + red:expire(key, ttl) - if not set_ok then - red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) - return set_err + local results, exec_err = red:exec() + if not results then + red:close() + return exec_err end - red:expire(key, ttl) red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) return nil end From bf250acfc2472ec9eca64fc61636801fa64dc5f8 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 07:50:10 +0800 Subject: [PATCH 34/38] refactor(ai-cache): drop log-phase embedding re-fetch --- apisix/plugins/ai-cache.lua | 45 ++++++++++++++----------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua index 026a2ee45b1f..476bd785d85a 100644 --- a/apisix/plugins/ai-cache.lua +++ b/apisix/plugins/ai-cache.lua @@ -83,7 +83,6 @@ function _M.access(conf, ctx) local req_headers = ngx.req.get_headers() for _, rule in ipairs(conf.bypass_on) do if req_headers[rule.header] == rule.equals then - ctx.ai_cache_bypass = true ctx.ai_cache_status = "BYPASS" return end @@ -170,11 +169,14 @@ function _M.access(conf, ctx) elseif cached_text then core.log.info("ai-cache: L2 hit, similarity=", similarity) - local l1_ttl = (conf.exact and conf.exact.ttl) or 3600 - local l1_err = exact.set(conf, scope_hash, prompt_hash, cached_text, l1_ttl) - - if l1_err then - core.log.warn("ai-cache: L2->L1 backfill failed: ", l1_err) + if layer_enabled(conf, "exact") then + local l1_ttl = (conf.exact and conf.exact.ttl) or 3600 + local l1_err = exact.set( + conf, scope_hash, prompt_hash, cached_text, l1_ttl + ) + if l1_err then + core.log.warn("ai-cache: L2->L1 backfill failed: ", l1_err) + end end ctx.ai_cache_status = "HIT-L2" @@ -228,7 +230,7 @@ function _M.log(conf, ctx) return end - -- Early-MISS paths (body parse / protocol detect / hash failure) skip + -- Early-MISS paths (body parse / protocol detect / empty content) skip -- key computation, so bail out if cache key fields are absent. 
if not ctx.ai_cache_prompt_hash or not ctx.ai_cache_prompt_text then return @@ -258,9 +260,8 @@ function _M.log(conf, ctx) local scope_hash = ctx.ai_cache_scope_hash local prompt_hash = ctx.ai_cache_prompt_hash local embedding = ctx.ai_cache_embedding - local prompt_text = ctx.ai_cache_prompt_text - ngx.timer.at(0, function(premature) + local ok, timer_err = ngx.timer.at(0, function(premature) if premature then return end @@ -268,39 +269,27 @@ function _M.log(conf, ctx) if exact_enabled then local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl_exact) if err then - ngx.log(ngx.ERR, "ai-cache: failed to write L1 cache: ", err) + ngx.log(ngx.WARN, "ai-cache: failed to write L1 cache: ", err) end end if semantic_enabled then - local vec = embedding - - if not vec then - local emb_conf = conf.semantic.embedding - local emb_driver = require( - "apisix.plugins.ai-cache.embeddings." .. emb_conf.provider - ) - local httpc = http.new() - local emb, _, emb_err = emb_driver.get_embeddings( - emb_conf, prompt_text, httpc, emb_conf.ssl_verify - ) - if not emb then - ngx.log(ngx.WARN, - "ai-cache: failed to get embedding for L2 store: ", emb_err) - return - end - vec = emb + if not embedding then + return end local ttl_semantic = (conf.semantic and conf.semantic.ttl) or 86400 local store_err = semantic.store( - conf, scope_hash, vec, response_text, ttl_semantic + conf, scope_hash, embedding, response_text, ttl_semantic ) if store_err then ngx.log(ngx.WARN, "ai-cache: failed to write L2 cache: ", store_err) end end end) + if not ok then + core.log.warn("ai-cache: failed to schedule cache write: ", timer_err) + end end From 4796a1d1485bef1227c5365f2d881b56534d7a34 Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 07:53:54 +0800 Subject: [PATCH 35/38] chore(ai-cache): fix lint --- apisix/plugins/ai-cache/exact.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua index 52f3f98f6f56..e1a63f9a5f41 100644 --- a/apisix/plugins/ai-cache/exact.lua +++ b/apisix/plugins/ai-cache/exact.lua @@ -128,7 +128,7 @@ function _M.set(conf, scope_hash, prompt_hash, text, ttl) end red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) - + return nil end From 69e8c8e0b5f1ea1480e7cc80c229dfaa4180dfbb Mon Sep 17 00:00:00 2001 From: janiussyafiq Date: Tue, 5 May 2026 08:00:47 +0800 Subject: [PATCH 36/38] docs(ai-cache): remove outdated caveats --- docs/en/latest/plugins/ai-cache.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md index cec70ae3b675..46a446afd0bb 100644 --- a/docs/en/latest/plugins/ai-cache.md +++ b/docs/en/latest/plugins/ai-cache.md @@ -1073,16 +1073,8 @@ Two prompts that look semantically equivalent to a human can score below the con Lower the threshold to catch more paraphrases at the cost of occasionally serving a cached answer for a genuinely different question. Tune empirically against your traffic. -### Switching embedding models is safe - -The Plugin namespaces the L2 index and entries by embedding dimension (for example `1536` for `text-embedding-3-small`, `3072` for `text-embedding-3-large`), so changing the embedding model on a live route does not require any manual cleanup. A new index is created automatically for the new dimension; old entries from the previous model expire via the configured `semantic.ttl`. 
-

### `BYPASS` does not refresh the cache

A request with the bypass header reaches the upstream but its response is not written back. Use it to force a fresh upstream call without invalidating or replacing the existing cached entry.

The bypass header is not authenticated: any client that can set the configured header and value can bypass the cache. In production, gate access with an APISIX plugin such as `key-auth` (authentication) or `ip-restriction` (IP allow/deny lists), or restrict the header at your upstream WAF.
-
-### The semantic layer requires Redis Stack
-
-The `FT.CREATE` and `FT.SEARCH` commands used by the semantic layer come from the RediSearch module. Vanilla Redis will fail these commands and the layer will silently degrade to `MISS`. Use a Redis Stack image such as `redis/redis-stack:latest`.

From a8843cc10c1f55ddc5cf0f8eb2f07b34001047ee Mon Sep 17 00:00:00 2001
From: janiussyafiq
Date: Tue, 5 May 2026 08:06:18 +0800
Subject: [PATCH 37/38] chore(ai-cache): fix lint

---
 t/plugin/ai-cache.t | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t
index b8e1557c2cca..abc652328ac8 100644
--- a/t/plugin/ai-cache.t
+++ b/t/plugin/ai-cache.t
@@ -133,7 +133,7 @@ passed
                 layers = { "semantic" },
                 redis_host = "127.0.0.1",
             })
-            
+
             if not ok then
                 ngx.say("failed: ", err)
             else

From a1f012841b946660d07a524a3e33771ae441b0ba Mon Sep 17 00:00:00 2001
From: janiussyafiq
Date: Wed, 6 May 2026 00:09:33 +0800
Subject: [PATCH 38/38] docs(ai-cache): refresh examples and tighten cache-status callouts

---
 docs/en/latest/plugins/ai-cache.md | 158 +++++++++++++++++++++++++----
 1 file changed, 136 insertions(+), 22 deletions(-)

diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md
index 46a446afd0bb..523b727f836f 100644
--- a/docs/en/latest/plugins/ai-cache.md
+++ b/docs/en/latest/plugins/ai-cache.md
@@ -36,9 +36,9 @@ import TabItem from '@theme/TabItem';

 ## Description

-The `ai-cache` Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache instead of incurring another upstream call. It supports two cache layers: an exact-match layer (`exact`) keyed by a hash of the prompt, and a semantic layer (`semantic`) that compares prompt embeddings via Redis Stack vector search. Either layer can be enabled independently, and a hit on the semantic layer backfills the exact layer so subsequent identical prompts return immediately.
+The `ai-cache` Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache instead of incurring another upstream call. It supports two cache layers: an exact-match layer (`exact`) keyed by a hash of the prompt, and a semantic layer (`semantic`) that compares prompt embeddings via Redis Stack vector search. Either layer can be enabled independently, and when both are enabled, a hit on the semantic layer backfills the exact layer so subsequent identical prompts return immediately.

-The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-multi](./ai-proxy-multi.md) on the same Route. The semantic layer requires Redis Stack with the RediSearch module and an embedding provider (OpenAI or Azure OpenAI).
+The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-multi](./ai-proxy-multi.md) on the same Route. The semantic layer requires Redis Stack with the RediSearch module and an embedding provider (OpenAI or Azure OpenAI). PRs for additional embedding providers are welcome.
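+
+For orientation, a minimal plugin snippet that enables both layers (as it would appear under a Route's `plugins`) might look like the following sketch. The endpoint URL, model name, API key, and Redis address are illustrative placeholders; the attribute table and the examples below show complete, working configurations:
+
+```json
+{
+  "ai-cache": {
+    "layers": ["exact", "semantic"],
+    "exact": { "ttl": 3600 },
+    "semantic": {
+      "similarity_threshold": 0.85,
+      "ttl": 86400,
+      "embedding": {
+        "provider": "openai",
+        "endpoint": "https://api.openai.com/v1/embeddings",
+        "model": "text-embedding-3-small",
+        "api_key": "<your-embedding-api-key>"
+      }
+    },
+    "redis_host": "127.0.0.1"
+  }
+}
+```
+
+With such a configuration, the first occurrence of a prompt is a `MISS`, an identical retry is a `HIT-L1`, and a sufficiently close paraphrase is a `HIT-L2`.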
## Plugin Attributes @@ -283,7 +283,7 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ }' ``` -The first request reaches OpenAI and you should receive a response similar to the following: +The first request reaches OpenAI. Note the `X-AI-Cache-Status: MISS` header, indicating the prompt was not in cache and APISIX forwarded the request upstream: ```text HTTP/1.1 200 OK @@ -328,7 +328,7 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ }' ``` -The second request returns from cache without contacting OpenAI. The cached response is replayed from Redis, so the body is shorter and does not contain the original `created`, `model`, `usage`, or `system_fingerprint` fields: +The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header signals an exact-match hit and `X-AI-Cache-Age` reports the entry's age in seconds. The cached response is replayed from Redis, so the body is shorter and does not contain the original `created`, `model`, `usage`, or `system_fingerprint` fields: ```text HTTP/1.1 200 OK @@ -382,7 +382,7 @@ curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ "layers": ["exact", "semantic"], "exact": { "ttl": 3600 }, "semantic": { - "similarity_threshold": 0.92, + "similarity_threshold": 0.85, "ttl": 86400, "embedding": { "provider": "openai", @@ -424,7 +424,7 @@ services: exact: ttl: 3600 semantic: - similarity_threshold: 0.92 + similarity_threshold: 0.85 ttl: 86400 embedding: provider: openai @@ -470,7 +470,7 @@ spec: exact: ttl: 3600 semantic: - similarity_threshold: 0.92 + similarity_threshold: 0.85 ttl: 86400 embedding: provider: openai @@ -538,7 +538,7 @@ spec: exact: ttl: 3600 semantic: - similarity_threshold: 0.92 + similarity_threshold: 0.85 ttl: 86400 embedding: provider: openai @@ -567,12 +567,12 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ -H "Content-Type: application/json" \ -d '{ "messages": [ - { "role": "user", "content": "What is the capital of France?" } + { "role": "user", "content": "What is the capital city of China?" } ] }' ``` -The first request reaches OpenAI: +The first request reaches OpenAI with `X-AI-Cache-Status: MISS`: ```text HTTP/1.1 200 OK @@ -581,20 +581,20 @@ Server: APISIX/3.16.0 X-AI-Cache-Status: MISS { - "id": "chatcmpl-Da7Iqsqz9gc8Mkf07Hn4NCzAH5Ri1", + "id": "chatcmpl-DcCIDs6ZJisclo84FUk5fT2Ks5vzn", "object": "chat.completion", - "model": "gpt-4o-mini-2024-07-18", + "model": "gpt-4-0613", "choices": [ { "index": 0, "message": { "role": "assistant", - "content": "The capital of France is Paris." + "content": "The capital city of China is Beijing." }, "finish_reason": "stop" } ], - "usage": { "prompt_tokens": 14, "completion_tokens": 7, "total_tokens": 21 } + "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 } } ``` @@ -605,28 +605,28 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ -H "Content-Type: application/json" \ -d '{ "messages": [ - { "role": "user", "content": "capital of France what is?" } + { "role": "user", "content": "Capital city of China?" } ] }' ``` -The semantic layer matches the embedding (cosine similarity above the threshold) and returns the cached response without contacting OpenAI: +The semantic layer matches the embedding (cosine similarity above the threshold) and returns the cached response without contacting OpenAI. 
The `X-AI-Cache-Status: HIT-L2` header signals a semantic-layer hit and `X-AI-Cache-Similarity` reports the cosine similarity score: ```text HTTP/1.1 200 OK Content-Type: application/json Server: APISIX/3.16.0 X-AI-Cache-Status: HIT-L2 -X-AI-Cache-Similarity: 0.9720680713654 +X-AI-Cache-Similarity: 0.9065774679184 { - "id": "40b612a5-1424-4096-b7ec-8537a1ee6fd3", + "id": "a95488bb-4a51-491a-bd5b-2c1d0e5f8a9b", "object": "chat.completion", "choices": [ { "index": 0, "message": { - "content": "The capital of France is Paris.", + "content": "The capital city of China is Beijing.", "role": "assistant" }, "finish_reason": "stop" @@ -635,7 +635,7 @@ X-AI-Cache-Similarity: 0.9720680713654 } ``` -A semantic-layer hit also backfills the exact layer, so an immediate retry of the same paraphrase returns `X-AI-Cache-Status: HIT-L1`. +When the `exact` layer is also enabled (as in this example), a semantic-layer hit backfills it, so an immediate retry of the same paraphrase returns `X-AI-Cache-Status: HIT-L1`. ### Isolate Cache Entries Per Consumer or Tenant @@ -823,7 +823,121 @@ kubectl apply -f ai-cache-ic.yaml -Two requests with the same prompt but different `X-Tenant-Id` headers each receive `X-AI-Cache-Status: MISS`, because the cache key now includes the tenant identifier. +Send a first request as `tenant-a`: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Tenant-Id: tenant-a" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of Japan?" } + ] + }' +``` + +The first request reaches OpenAI with `X-AI-Cache-Status: MISS` and primes `tenant-a`'s cache scope: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-DcCRAzeSsimIOIeLQWsKtDxMLAAhu", + "object": "chat.completion", + "model": "gpt-4-0613", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital city of Japan is Tokyo." + }, + "finish_reason": "stop" + } + ], + "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 } +} +``` + +Repeat the same prompt as `tenant-a`: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Tenant-Id: tenant-a" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of Japan?" } + ] + }' +``` + +The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header confirms `tenant-a`'s entry was reused: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: HIT-L1 +X-AI-Cache-Age: 6 + +{ + "id": "6be4f7a2-83f1-4cdc-8654-cee0396bd4f3", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "content": "The capital city of Japan is Tokyo.", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` + +Send the same prompt as a different tenant, `tenant-b`: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Tenant-Id: tenant-b" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of Japan?" 
} + ] + }' +``` + +Even though the prompt is identical, the request reaches OpenAI with `X-AI-Cache-Status: MISS` because `tenant-b` has its own cache scope: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-DcCROH92JLWcgyhSpwEoutTvqnew5", + "object": "chat.completion", + "model": "gpt-4-0613", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital city of Japan is Tokyo." + }, + "finish_reason": "stop" + } + ], + "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 } +} +``` ### Bypass the Cache on a Header @@ -1020,7 +1134,7 @@ curl -i "http://127.0.0.1:9080/anything" -X POST \ }' ``` -The request reaches OpenAI even though a cached entry exists, and the response is not written back to cache. You can confirm the upstream was contacted because the response includes the original `created`, `model`, `usage`, and `system_fingerprint` fields: +The request reaches OpenAI even though a cached entry exists, and the response is not written back to cache. The `X-AI-Cache-Status: BYPASS` header confirms the cache was skipped, and the response includes the original `created`, `model`, `usage`, and `system_fingerprint` fields, verifying the upstream was contacted: ```text HTTP/1.1 200 OK