diff --git a/Makefile b/Makefile
index 71ab7df1eabf..ad074ecabbdf 100644
--- a/Makefile
+++ b/Makefile
@@ -388,6 +388,11 @@ install: runtime
$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-transport
$(ENV_INSTALL) apisix/plugins/ai-transport/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-transport
+ $(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache
+ $(ENV_INSTALL) apisix/plugins/ai-cache/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache
+ $(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings
+ $(ENV_INSTALL) apisix/plugins/ai-cache/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings
+
$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings
$(ENV_INSTALL) apisix/plugins/ai-rag/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings
$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/vector-search
diff --git a/apisix/cli/config.lua b/apisix/cli/config.lua
index 956eef30c267..b4df580666c0 100644
--- a/apisix/cli/config.lua
+++ b/apisix/cli/config.lua
@@ -231,6 +231,7 @@ local _M = {
"ai-prompt-template",
"ai-prompt-decorator",
"ai-prompt-guard",
+ "ai-cache",
"ai-rag",
"ai-rate-limiting",
"ai-proxy-multi",
diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua
new file mode 100644
index 000000000000..476bd785d85a
--- /dev/null
+++ b/apisix/plugins/ai-cache.lua
@@ -0,0 +1,296 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+local schema = require("apisix.plugins.ai-cache.schema")
+local exact = require("apisix.plugins.ai-cache.exact")
+local semantic = require("apisix.plugins.ai-cache.semantic")
+local protocols = require("apisix.plugins.ai-protocols")
+local http = require("resty.http")
+local ngx = ngx
+local ngx_time = ngx.time
+local ngx_now = ngx.now
+local ipairs = ipairs
+local require = require
+local tostring = tostring
+local table_concat = table.concat
+
+local plugin_name = "ai-cache"
+
+local _M = {
+ version = 0.1,
+ priority = 1065,
+ name = plugin_name,
+ schema = schema.schema
+}
+
+
+local function layer_enabled(conf, name)
+ local layers = conf.layers or { "exact", "semantic" }
+ for _, l in ipairs(layers) do
+        if l == name then
+            return true
+        end
+ end
+ return false
+end
+
+
+local function populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text)
+ ctx.ai_client_protocol = protocol_name
+ ctx.var.request_type = is_stream and "ai_stream" or "ai_chat"
+ if body_tab.model then
+ ctx.var.request_llm_model = body_tab.model
+ ctx.var.llm_model = body_tab.model
+ end
+ ctx.var.llm_response_text = cached_text
+end
+
+
+function _M.check_schema(conf)
+ local ok, err = core.schema.check(schema.schema, conf)
+ if not ok then
+ return false, err
+ end
+
+ if layer_enabled(conf, "semantic") then
+ if not (conf.semantic and conf.semantic.embedding) then
+ return false, "semantic layer requires semantic.embedding to be configured"
+ end
+ end
+
+ core.utils.check_https({ "semantic.embedding.endpoint" }, conf, plugin_name)
+
+ return true
+end
+
+
+function _M.access(conf, ctx)
+ -- Check bypass_on conditions
+ if conf.bypass_on then
+ local req_headers = ngx.req.get_headers()
+ for _, rule in ipairs(conf.bypass_on) do
+ if req_headers[rule.header] == rule.equals then
+ ctx.ai_cache_status = "BYPASS"
+ return
+ end
+ end
+ end
+
+ local body_tab, err = core.request.get_json_request_body_table()
+ if not body_tab then
+ core.log.warn("ai-cache: failed to read request body: ", err or "unknown error")
+ ctx.ai_cache_status = "MISS"
+ return
+ end
+
+ local protocol_name = protocols.detect(body_tab, ctx)
+ if not protocol_name then
+ core.log.warn("ai-cache: could not detect AI protocol, skipping cache")
+ ctx.ai_cache_status = "MISS"
+ return
+ end
+
+ local proto = protocols.get(protocol_name)
+ local contents = proto.extract_request_content(body_tab)
+ if not contents or #contents == 0 then
+ ctx.ai_cache_status = "MISS"
+ return
+ end
+
+ local prompt_text = table_concat(contents, " ")
+ local scope_hash = exact.compute_scope_hash(conf, ctx)
+ local prompt_hash = exact.compute_prompt_hash(prompt_text)
+
+ local is_stream = body_tab.stream == true
+
+ -- L1 exact lookup
+ if layer_enabled(conf, "exact") then
+ local cached_text, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash)
+ if lookup_err then
+ core.log.warn("ai-cache: L1 lookup error: ", lookup_err)
+ elseif cached_text then
+ core.log.info("ai-cache: L1 hit for key: ", prompt_hash)
+ ctx.ai_cache_status = "HIT-L1"
+ ctx.ai_cache_written_at = written_at
+ if is_stream then
+ core.response.set_header("Content-Type", "text/event-stream")
+ else
+ core.response.set_header("Content-Type", "application/json")
+ end
+ populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text)
+ -- TODO: rename build_deny_response to build_response_from_text in a
+ -- follow-up. We use it here to wrap cached text in the protocol's
+ -- response shape, not for policy denial.
+ return 200, proto.build_deny_response({
+ stream = is_stream,
+ text = cached_text,
+ })
+ end
+ end
+
+ -- L2 semantic lookup
+ if layer_enabled(conf, "semantic") then
+ local emb_conf = conf.semantic.embedding
+ local emb_driver = require("apisix.plugins.ai-cache.embeddings." .. emb_conf.provider)
+ local httpc = http.new()
+
+ local t0 = ngx_now()
+ local embedding, _, emb_err = emb_driver.get_embeddings(
+ emb_conf, prompt_text, httpc, emb_conf.ssl_verify
+ )
+ if not embedding then
+ core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err)
+ ctx.ai_cache_embedding_failed = true
+ else
+ ctx.ai_cache_embedding_latency_ms = (ngx_now() - t0) * 1000
+ ctx.ai_cache_embedding_provider = emb_conf.provider
+ ctx.ai_cache_embedding = embedding
+
+ local threshold = conf.semantic.similarity_threshold or 0.95
+ local cached_text, similarity, search_err = semantic.search(
+ conf, scope_hash, embedding, threshold
+ )
+
+ if search_err then
+ core.log.warn("ai-cache: L2 search error (degrading to MISS): ", search_err)
+ elseif cached_text then
+ core.log.info("ai-cache: L2 hit, similarity=", similarity)
+
+ if layer_enabled(conf, "exact") then
+ local l1_ttl = (conf.exact and conf.exact.ttl) or 3600
+ local l1_err = exact.set(
+ conf, scope_hash, prompt_hash, cached_text, l1_ttl
+ )
+ if l1_err then
+ core.log.warn("ai-cache: L2->L1 backfill failed: ", l1_err)
+ end
+ end
+
+ ctx.ai_cache_status = "HIT-L2"
+ ctx.ai_cache_similarity = similarity
+ if is_stream then
+ core.response.set_header("Content-Type", "text/event-stream")
+ else
+ core.response.set_header("Content-Type", "application/json")
+ end
+ populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text)
+ return 200, proto.build_deny_response({
+ stream = is_stream,
+ text = cached_text,
+ })
+ end
+ end
+ end
+
+ ctx.ai_cache_status = "MISS"
+ ctx.ai_cache_scope_hash = scope_hash
+ ctx.ai_cache_prompt_hash = prompt_hash
+ ctx.ai_cache_prompt_text = prompt_text
+end
+
+
+function _M.header_filter(conf, ctx)
+ if not ctx.ai_cache_status then
+ return
+ end
+
+ local status_header = (conf.headers and conf.headers.cache_status)
+ or "X-AI-Cache-Status"
+ ngx.header[status_header] = ctx.ai_cache_status
+
+ if ctx.ai_cache_status == "HIT-L1" and ctx.ai_cache_written_at then
+ local age_header = (conf.headers and conf.headers.cache_age)
+ or "X-AI-Cache-Age"
+ ngx.header[age_header] = tostring(ngx_time() - ctx.ai_cache_written_at)
+ end
+
+ if ctx.ai_cache_status == "HIT-L2" and ctx.ai_cache_similarity then
+ local sim_header = (conf.headers and conf.headers.cache_similarity)
+ or "X-AI-Cache-Similarity"
+ ngx.header[sim_header] = tostring(ctx.ai_cache_similarity)
+ end
+end
+
+
+function _M.log(conf, ctx)
+ if ctx.ai_cache_status ~= "MISS" then
+ return
+ end
+
+ -- Early-MISS paths (body parse / protocol detect / empty content) skip
+ -- key computation, so bail out if cache key fields are absent.
+ if not ctx.ai_cache_prompt_hash or not ctx.ai_cache_prompt_text then
+ return
+ end
+
+ local upstream_status = core.response.get_upstream_status(ctx) or ngx.status
+ if not upstream_status or upstream_status < 200 or upstream_status >= 300 then
+ return
+ end
+
+ local response_text = ctx.var.llm_response_text
+ if not response_text or response_text == "" then
+ return
+ end
+
+ local max_size = conf.max_cache_body_size or 1048576
+ if #response_text > max_size then
+ core.log.warn("ai-cache: response size ", #response_text,
+ " exceeds max_cache_body_size ", max_size,
+ ", skipping cache write")
+ return
+ end
+
+ local exact_enabled = layer_enabled(conf, "exact")
+ local semantic_enabled = layer_enabled(conf, "semantic")
+ local ttl_exact = (conf.exact and conf.exact.ttl) or 3600
+ local scope_hash = ctx.ai_cache_scope_hash
+ local prompt_hash = ctx.ai_cache_prompt_hash
+ local embedding = ctx.ai_cache_embedding
+
+ local ok, timer_err = ngx.timer.at(0, function(premature)
+ if premature then
+ return
+ end
+
+ if exact_enabled then
+ local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl_exact)
+ if err then
+ ngx.log(ngx.WARN, "ai-cache: failed to write L1 cache: ", err)
+ end
+ end
+
+ if semantic_enabled then
+ if not embedding then
+ return
+ end
+
+ local ttl_semantic = (conf.semantic and conf.semantic.ttl) or 86400
+ local store_err = semantic.store(
+ conf, scope_hash, embedding, response_text, ttl_semantic
+ )
+ if store_err then
+ ngx.log(ngx.WARN, "ai-cache: failed to write L2 cache: ", store_err)
+ end
+ end
+ end)
+ if not ok then
+ core.log.warn("ai-cache: failed to schedule cache write: ", timer_err)
+ end
+end
+
+
+return _M
diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua
new file mode 100644
index 000000000000..6f862ea78cc8
--- /dev/null
+++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua
@@ -0,0 +1,76 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+local type = type
+
+local ngx = ngx
+local HTTP_OK = ngx.HTTP_OK
+local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR
+
+local _M = {}
+
+
+function _M.get_embeddings(conf, text, httpc, ssl_verify)
+ local body, err = core.json.encode({ input = text })
+ if not body then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err
+ end
+
+ httpc:set_timeout(conf.timeout)
+
+ local res, err = httpc:request_uri(conf.endpoint, {
+ method = "POST",
+ headers = {
+ ["Content-Type"] = "application/json",
+ ["api-key"] = conf.api_key,
+ },
+ body = body,
+ ssl_verify = ssl_verify,
+ keepalive = true,
+ })
+
+ if not res or not res.body then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API"
+ end
+
+ if res.status ~= HTTP_OK then
+ return nil, res.status, res.body
+ end
+
+ local res_tab, err = core.json.decode(res.body)
+ if not res_tab then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err
+ end
+
+ if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body
+ end
+
+ local embedding = res_tab.data[1].embedding
+ if type(embedding) ~= "table" then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response"
+ end
+ if #embedding == 0 then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty"
+ end
+
+ return embedding, nil, nil
+end
+
+
+return _M
diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua
new file mode 100644
index 000000000000..740b12d23f2d
--- /dev/null
+++ b/apisix/plugins/ai-cache/embeddings/openai.lua
@@ -0,0 +1,79 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+local type = type
+
+local ngx = ngx
+local HTTP_OK = ngx.HTTP_OK
+local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR
+
+local _M = {}
+
+
+function _M.get_embeddings(conf, text, httpc, ssl_verify)
+ local body, err = core.json.encode({
+ input = text,
+ model = conf.model or "text-embedding-3-small",
+ })
+ if not body then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err
+ end
+
+ httpc:set_timeout(conf.timeout)
+
+ local res, err = httpc:request_uri(conf.endpoint, {
+ method = "POST",
+ headers = {
+ ["Content-Type"] = "application/json",
+ ["Authorization"] = "Bearer " .. conf.api_key,
+ },
+ body = body,
+ ssl_verify = ssl_verify,
+ keepalive = true,
+ })
+
+ if not res or not res.body then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API"
+ end
+
+ if res.status ~= HTTP_OK then
+ return nil, res.status, res.body
+ end
+
+ local res_tab, err = core.json.decode(res.body)
+ if not res_tab then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err
+ end
+
+ if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body
+ end
+
+ local embedding = res_tab.data[1].embedding
+ if type(embedding) ~= "table" then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response"
+ end
+ if #embedding == 0 then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty"
+ end
+
+ return embedding, nil, nil
+end
+
+
+return _M
diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua
new file mode 100644
index 000000000000..e1a63f9a5f41
--- /dev/null
+++ b/apisix/plugins/ai-cache/exact.lua
@@ -0,0 +1,136 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+local redis = require("apisix.utils.redis")
+local resty_sha256 = require("resty.sha256")
+local to_hex = require("resty.string").to_hex
+
+local ngx = ngx
+local ngx_time = ngx.time
+local ipairs = ipairs
+local tostring = tostring
+local table_concat = table.concat
+
+local KEY_PREFIX = "ai-cache:l1:"
+
+local _M = {}
+
+
+local function sha256_hex(s)
+ local hash = resty_sha256:new()
+ hash:update(s)
+ return to_hex(hash:final())
+end
+
+
+function _M.compute_scope_hash(conf, ctx)
+ local cache_key = conf.cache_key
+ if not cache_key then
+ return ""
+ end
+
+ local parts = {}
+ local n = 0
+
+ if cache_key.include_consumer then
+ n = n + 1
+ parts[n] = ctx.consumer_name or ""
+ end
+
+ if cache_key.include_vars then
+ for _, var_name in ipairs(cache_key.include_vars) do
+ local key = var_name
+ if key:sub(1, 1) == "$" then
+ key = key:sub(2)
+ end
+ n = n + 1
+ parts[n] = tostring(ctx.var[key] or "")
+ end
+ end
+
+ if n == 0 then
+ return ""
+ end
+
+ return sha256_hex(table_concat(parts, "|"))
+end
+
+
+function _M.compute_prompt_hash(text)
+ return sha256_hex(text)
+end
+
+
+function _M.get(conf, scope_hash, prompt_hash)
+ local red, err = redis.new(conf)
+ if not red then
+ return nil, nil, err
+ end
+
+ local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash
+ local res, get_err = red:get(key)
+ if get_err then
+ red:close()
+ return nil, nil, get_err
+ end
+
+ red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool)
+
+ if res == ngx.null then
+ return nil, nil, nil
+ end
+
+ local entry, decode_err = core.json.decode(res)
+ if not entry then
+ return nil, nil, "corrupt cache entry: " .. decode_err
+ end
+
+ return entry.text, entry.written_at, nil
+end
+
+
+function _M.set(conf, scope_hash, prompt_hash, text, ttl)
+ local red, err = redis.new(conf)
+ if not red then
+ return err
+ end
+
+ local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash
+ local entry, encode_err = core.json.encode({
+ text = text,
+ written_at = ngx_time(),
+ })
+
+ if not entry then
+ red:close()
+ return encode_err
+ end
+
+ local ok, set_err = red:set(key, entry, "EX", ttl)
+ if not ok then
+ red:close()
+ return set_err
+ end
+
+ red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool)
+
+ return nil
+end
+
+
+return _M
diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua
new file mode 100644
index 000000000000..02587f7fb14c
--- /dev/null
+++ b/apisix/plugins/ai-cache/schema.lua
@@ -0,0 +1,189 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local redis_schema = require("apisix.utils.redis-schema")
+
+local _M = {}
+
+local embedding_schema = {
+ type = "object",
+ properties = {
+ provider = {
+ type = "string",
+ enum = { "openai", "azure_openai" },
+ description = "Embedding API provider.",
+ },
+ model = {
+ type = "string",
+ description = "Embedding model name. Sent in the request body for "
+ .. "provider: openai; ignored for provider: azure_openai "
+ .. "(Azure infers the model from the deployment URL).",
+ },
+ endpoint = {
+ type = "string",
+ description = "Embedding API endpoint URL.",
+ },
+ api_key = {
+ type = "string",
+ description = "API key for the embedding provider.",
+ },
+ timeout = {
+ type = "integer",
+ minimum = 1,
+ maximum = 600000,
+ default = 5000,
+ description = "HTTP request timeout in milliseconds for embedding API calls.",
+ },
+ ssl_verify = {
+ type = "boolean",
+ default = true,
+ description = "Whether to verify the embedding endpoint's TLS certificate.",
+ },
+ },
+ required = { "provider", "endpoint", "api_key" },
+}
+
+local semantic_schema = {
+ type = "object",
+ properties = {
+ similarity_threshold = {
+ type = "number",
+ minimum = 0,
+ maximum = 1,
+ default = 0.95,
+ description = "Minimum cosine similarity required for a semantic-layer hit.",
+ },
+ top_k = {
+ type = "integer",
+ minimum = 1,
+ maximum = 100,
+ default = 1,
+ description = "Number of nearest-neighbor candidates the index returns; "
+ .. "the first candidate above similarity_threshold is used.",
+ },
+ ttl = {
+ type = "integer",
+ minimum = 1,
+ default = 86400,
+ description = "Time-to-live in seconds for semantic-layer entries.",
+ },
+ embedding = embedding_schema,
+ },
+ required = { "embedding" },
+}
+
+local exact_schema = {
+ type = "object",
+ properties = {
+ ttl = {
+ type = "integer",
+ minimum = 1,
+ default = 3600,
+ description = "Time-to-live in seconds for exact-layer entries.",
+ },
+ },
+}
+
+
+local bypass_item_schema = {
+ type = "object",
+ properties = {
+ header = {
+ type = "string",
+ description = "Request header name to inspect.",
+ },
+ equals = {
+ type = "string",
+ description = "Value to match against the header. "
+ .. "If equal, the request bypasses the cache.",
+ },
+ },
+ required = { "header", "equals" },
+}
+
+local headers_schema = {
+ type = "object",
+ properties = {
+ cache_status = {
+ type = "string",
+ default = "X-AI-Cache-Status",
+ description = "Response header name for cache status "
+ .. "(HIT-L1 / HIT-L2 / MISS / BYPASS).",
+ },
+ cache_similarity = {
+ type = "string",
+ default = "X-AI-Cache-Similarity",
+ description = "Response header name for the similarity score of a semantic-layer hit.",
+ },
+ cache_age = {
+ type = "string",
+ default = "X-AI-Cache-Age",
+ description = "Response header name for the age in seconds of an exact-layer hit.",
+ },
+ },
+}
+
+_M.schema = {
+ type = "object",
+ properties = {
+ layers = {
+ type = "array",
+ items = { type = "string", enum = { "exact", "semantic" } },
+ uniqueItems = true,
+ minItems = 1,
+ default = { "exact", "semantic" },
+ description = "Cache layers to enable, queried in order.",
+ },
+ cache_key = {
+ type = "object",
+ properties = {
+ include_consumer = {
+ type = "boolean",
+ default = false,
+ description = "If true, partition the cache by consumer name.",
+ },
+ include_vars = {
+ type = "array",
+ items = { type = "string" },
+ default = {},
+ description = "Additional ctx.var names included in the cache key, "
+ .. "for example [\"$http_x_tenant_id\"].",
+ },
+ },
+ },
+ exact = exact_schema,
+ semantic = semantic_schema,
+ bypass_on = {
+ type = "array",
+ items = bypass_item_schema,
+ description = "List of {header, equals} rules. "
+ .. "If any matches, the request bypasses the cache.",
+ },
+ headers = headers_schema,
+ max_cache_body_size = {
+ type = "integer",
+ minimum = 1,
+ default = 1048576,
+ description = "Maximum response size in bytes to write to cache. "
+ .. "Larger responses pass through but are not cached.",
+ },
+ },
+ allOf = { redis_schema.schema.redis },
+ encrypt_fields = { "semantic.embedding.api_key", "redis_password" },
+}
+
+return _M
diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua
new file mode 100644
index 000000000000..6d84dbd28675
--- /dev/null
+++ b/apisix/plugins/ai-cache/semantic.lua
@@ -0,0 +1,212 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local apisix_redis = require("apisix.utils.redis")
+local uuid = require("resty.jit-uuid")
+local ffi = require("ffi")
+
+local ffi_new = ffi.new
+local ffi_string = ffi.string
+local ngx_time = ngx.time
+local tostring = tostring
+local tonumber = tonumber
+local type = type
+
+local _M = {}
+
+
+local function index_name(dim)
+ return "ai-cache-idx-" .. dim
+end
+
+
+local function key_prefix(dim)
+ return "ai-cache:l2:" .. dim .. ":"
+end
+
+-- Pack a Lua array of numbers into the raw float32 byte string layout that
+-- RediSearch expects for a FLOAT32 vector field.
+local function pack_vector(vec)
+ local n = #vec
+ local buf = ffi_new("float[?]", n)
+ for i = 0, n - 1 do
+ buf[i] = vec[i + 1]
+ end
+ return ffi_string(buf, n * 4)
+end
+
+local index_ready = {}
+local index_unsupported = false
+
+local function ensure_index(red, dim)
+ if index_unsupported then
+ return nil, "RediSearch not supported on this Redis instance"
+ end
+
+ if index_ready[dim] then
+ return true
+ end
+
+ local _, err = red["FT.CREATE"](red,
+ index_name(dim),
+ "ON", "HASH",
+ "PREFIX", "1", key_prefix(dim),
+ "SCHEMA",
+ "embedding", "VECTOR", "HNSW", "6",
+ "TYPE", "FLOAT32",
+ "DIM", tostring(dim),
+ "DISTANCE_METRIC", "COSINE",
+ "scope", "TAG",
+ "created_at", "NUMERIC"
+ )
+
+ if err then
+ -- RediSearch module absent — latch and stop retrying on every request
+ if err:find("unknown command", 1, true)
+ or err:find("ERR unknown", 1, true) then
+ index_unsupported = true
+ return nil, "RediSearch not supported on this Redis instance: " .. err
+ end
+ if not err:find("already exists") then
+ return nil, "FT.CREATE failed: " .. err
+ end
+ end
+
+ index_ready[dim] = true
+ return true
+end
+
+
+function _M.search(conf, scope_hash, embedding_vec, threshold)
+ local red, err = apisix_redis.new(conf)
+ if not red then
+ return nil, nil, err
+ end
+
+ local ok, init_err = ensure_index(red, #embedding_vec)
+ if not ok then
+ red:close()
+ return nil, nil, init_err
+ end
+
+ local binary_vec = pack_vector(embedding_vec)
+ local top_k = (conf.semantic and conf.semantic.top_k) or 1
+ local top_k_str = tostring(top_k)
+
+ local query
+ if scope_hash == "" then
+ query = "*=>[KNN " .. top_k_str .. " @embedding $vec AS dist]"
+ else
+ query = "@scope:{" .. scope_hash .. "}=>[KNN " .. top_k_str
+ .. " @embedding $vec AS dist]"
+ end
+
+ local res, search_err = red["FT.SEARCH"](red,
+ index_name(#embedding_vec),
+ query,
+ "PARAMS", "2", "vec", binary_vec,
+ "SORTBY", "dist", "ASC",
+ "LIMIT", "0", top_k_str,
+ "RETURN", "2", "response", "dist",
+ "DIALECT", "2"
+ )
+
+ if search_err then
+ red:close()
+ -- index was dropped externally — invalidate so next call recreates
+ if search_err:find("Unknown Index name", 1, true) then
+ index_ready[#embedding_vec] = nil
+ end
+ return nil, nil, search_err
+ end
+
+ red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool)
+
+ if not res or res[1] == 0 then
+ return nil, nil, nil
+ end
+
+ -- RESP2: {count, key1, fields1, key2, fields2, ...}
+ -- Results are sorted by dist ASC. Iterate candidates and return the first
+ -- one whose similarity meets the threshold; skip candidates with missing
+ -- or corrupt fields.
+ for i = 3, #res, 2 do
+ local fields = res[i]
+ if type(fields) == "table" then
+ local response_text, dist
+ for j = 1, #fields, 2 do
+ if fields[j] == "response" then
+ response_text = fields[j + 1]
+ elseif fields[j] == "dist" then
+ dist = tonumber(fields[j + 1])
+ end
+ end
+
+            if response_text and dist then
+                -- with DISTANCE_METRIC COSINE, dist = 1 - cosine similarity
+                local similarity = 1 - dist
+ if similarity >= threshold then
+ return response_text, similarity, nil
+ end
+ end
+ end
+ end
+
+ return nil, nil, nil
+end
+
+
+function _M.store(conf, scope_hash, embedding_vec, text, ttl)
+ local red, err = apisix_redis.new(conf)
+ if not red then
+ return err
+ end
+
+ local ok, init_err = ensure_index(red, #embedding_vec)
+ if not ok then
+ red:close()
+ return init_err
+ end
+
+ local binary_vec = pack_vector(embedding_vec)
+ local key = key_prefix(#embedding_vec) .. uuid.generate_v4()
+
+ -- HSET + EXPIRE wrapped in MULTI/EXEC so the entry is never written
+ -- without its TTL (which would orphan it in Redis forever).
+ local _, multi_err = red:multi()
+ if multi_err then
+ red:close()
+ return multi_err
+ end
+
+ red:hset(key,
+ "embedding", binary_vec,
+ "response", text,
+ "scope", scope_hash,
+ "created_at", tostring(ngx_time())
+ )
+ red:expire(key, ttl)
+
+ local results, exec_err = red:exec()
+ if not results then
+ red:close()
+ return exec_err
+ end
+
+ red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool)
+ return nil
+end
+
+
+return _M
diff --git a/apisix/plugins/prometheus/exporter.lua b/apisix/plugins/prometheus/exporter.lua
index ce89ca03302a..78ce1bac0bf5 100644
--- a/apisix/plugins/prometheus/exporter.lua
+++ b/apisix/plugins/prometheus/exporter.lua
@@ -160,6 +160,14 @@ function _M.http_init(prometheus_enabled_in_stream)
"llm_completion_tokens", "expire")
local llm_active_connections_exptime = core.table.try_read_attr(attr, "metrics",
"llm_active_connections", "expire")
+ local ai_cache_hits_exptime = core.table.try_read_attr(attr, "metrics",
+ "ai_cache_hits", "expire")
+ local ai_cache_misses_exptime = core.table.try_read_attr(attr, "metrics",
+ "ai_cache_misses", "expire")
+ local ai_cache_embedding_latency_exptime = core.table.try_read_attr(attr, "metrics",
+ "ai_cache_embedding_latency", "expire")
+ local ai_cache_embedding_failures_exptime = core.table.try_read_attr(attr, "metrics",
+ "ai_cache_embedding_failures", "expire")
prometheus = base_prometheus.init("prometheus-metrics", metric_prefix)
@@ -260,6 +268,35 @@ function _M.http_init(prometheus_enabled_in_stream)
unpack(extra_labels("llm_active_connections"))},
llm_active_connections_exptime)
+ metrics.ai_cache_hits = prometheus:counter("ai_cache_hits_total",
+ "AI cache hit count by layer",
+ {"route_id", "service_id", "consumer", "layer",
+ unpack(extra_labels("ai_cache_hits"))},
+ ai_cache_hits_exptime)
+
+ metrics.ai_cache_misses = prometheus:counter("ai_cache_misses_total",
+ "AI cache miss count",
+ {"route_id", "service_id", "consumer",
+ unpack(extra_labels("ai_cache_misses"))},
+ ai_cache_misses_exptime)
+
+ local ai_cache_embedding_latency_buckets = DEFAULT_BUCKETS
+ if attr and attr.ai_cache_embedding_latency_buckets then
+ ai_cache_embedding_latency_buckets = attr.ai_cache_embedding_latency_buckets
+ end
+ metrics.ai_cache_embedding_latency = prometheus:histogram("ai_cache_embedding_latency",
+ "AI cache embedding API call latency in milliseconds",
+ {"route_id", "service_id", "consumer", "provider",
+ unpack(extra_labels("ai_cache_embedding_latency"))},
+ ai_cache_embedding_latency_buckets,
+ ai_cache_embedding_latency_exptime)
+
+ metrics.ai_cache_embedding_failures = prometheus:counter("ai_cache_embedding_failures_total",
+ "AI cache embedding API call failure count",
+ {"route_id", "service_id", "consumer",
+ unpack(extra_labels("ai_cache_embedding_failures"))},
+ ai_cache_embedding_failures_exptime)
+
if prometheus_enabled_in_stream then
init_stream_metrics()
end
@@ -377,6 +414,35 @@ function _M.http_log(conf, ctx)
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_completion_tokens", ctx))))
end
+
+ if ctx.ai_cache_status then
+ if ctx.ai_cache_status == "HIT-L1" then
+ metrics.ai_cache_hits:inc(1,
+ gen_arr(route_id, service_id, consumer_name, "l1",
+ unpack(extra_labels("ai_cache_hits", ctx))))
+ elseif ctx.ai_cache_status == "HIT-L2" then
+ metrics.ai_cache_hits:inc(1,
+ gen_arr(route_id, service_id, consumer_name, "l2",
+ unpack(extra_labels("ai_cache_hits", ctx))))
+ elseif ctx.ai_cache_status == "MISS" then
+ metrics.ai_cache_misses:inc(1,
+ gen_arr(route_id, service_id, consumer_name,
+ unpack(extra_labels("ai_cache_misses", ctx))))
+ end
+
+ if ctx.ai_cache_embedding_latency_ms then
+ metrics.ai_cache_embedding_latency:observe(ctx.ai_cache_embedding_latency_ms,
+ gen_arr(route_id, service_id, consumer_name,
+ ctx.ai_cache_embedding_provider or "",
+ unpack(extra_labels("ai_cache_embedding_latency", ctx))))
+ end
+
+ if ctx.ai_cache_embedding_failed then
+ metrics.ai_cache_embedding_failures:inc(1,
+ gen_arr(route_id, service_id, consumer_name,
+ unpack(extra_labels("ai_cache_embedding_failures", ctx))))
+ end
+ end
end
@@ -790,6 +856,7 @@ function _M.dec_llm_active_connections(ctx)
inc_llm_active_connections(ctx, -1)
end
+
function _M.get_prometheus()
return prometheus
end
diff --git a/conf/config.yaml.example b/conf/config.yaml.example
index ae7155a86b06..901774540d70 100644
--- a/conf/config.yaml.example
+++ b/conf/config.yaml.example
@@ -514,6 +514,7 @@ plugins: # plugin list (sorted by priority)
- ai-prompt-template # priority: 1071
- ai-prompt-decorator # priority: 1070
- ai-prompt-guard # priority: 1072
+ - ai-cache # priority: 1065
- ai-rag # priority: 1060
- ai-aws-content-moderation # priority: 1050
- ai-proxy-multi # priority: 1041
diff --git a/docs/en/latest/config.json b/docs/en/latest/config.json
index d24eacc3f8e9..c198826c7505 100644
--- a/docs/en/latest/config.json
+++ b/docs/en/latest/config.json
@@ -75,6 +75,7 @@
"plugins/ai-proxy-multi",
"plugins/ai-rate-limiting",
"plugins/ai-prompt-guard",
+ "plugins/ai-cache",
"plugins/ai-aws-content-moderation",
"plugins/ai-aliyun-content-moderation",
"plugins/ai-prompt-decorator",
diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md
new file mode 100644
index 000000000000..523b727f836f
--- /dev/null
+++ b/docs/en/latest/plugins/ai-cache.md
@@ -0,0 +1,1194 @@
+---
+title: ai-cache
+keywords:
+ - Apache APISIX
+ - API Gateway
+ - Plugin
+ - ai-cache
+description: The ai-cache Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache, reducing latency and upstream cost.
+---
+
+
+
+
+
+
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+## Description
+
+The `ai-cache` Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache instead of incurring another upstream call. It supports two cache layers: an exact-match layer (`exact`) keyed by a hash of the prompt, and a semantic layer (`semantic`) that compares prompt embeddings via Redis Stack vector search. Either layer can be enabled independently, and when both are enabled a hit on the semantic layer backfills the exact layer so subsequent identical prompts return immediately.
+
+The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-multi](./ai-proxy-multi.md) on the same Route. The semantic layer requires Redis Stack with the RediSearch module and an embedding provider (OpenAI or Azure OpenAI). PRs for additional embedding providers are welcome.
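+
+For local testing of the semantic layer, you can run Redis Stack, which bundles the RediSearch module, in Docker. This is just one convenient option; any Redis instance with RediSearch loaded works:
+
+```shell
+docker run -d --name redis-stack -p 6379:6379 redis/redis-stack-server:latest
+```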
+
+## Plugin Attributes
+
+| Name | Type | Required | Default | Valid values | Description |
+| --- | --- | --- | --- | --- | --- |
+| `layers` | array[string] | False | `["exact", "semantic"]` | `"exact"`, `"semantic"` | Cache layers to enable, queried in order. |
+| `exact.ttl` | integer | False | `3600` | [1, ...] | Time-to-live in seconds for exact-layer entries. |
+| `semantic.similarity_threshold` | number | False | `0.95` | [0, 1] | Minimum cosine similarity required for a semantic-layer hit. |
+| `semantic.top_k` | integer | False | `1` | [1, 100] | Number of nearest-neighbor candidates the index returns; the first candidate above `similarity_threshold` is used. |
+| `semantic.ttl` | integer | False | `86400` | [1, ...] | Time-to-live in seconds for semantic-layer entries. |
+| `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. |
+| `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | Embedding API endpoint URL. |
+| `semantic.embedding.api_key` | string | True (if semantic enabled) | | | API key for the embedding provider. Stored encrypted. |
+| `semantic.embedding.model` | string | False | | | Embedding model name. Sent in the request body for `provider: openai`; ignored for `provider: azure_openai` (Azure infers the model from the deployment URL). Uses provider default if omitted. |
+| `semantic.embedding.timeout` | integer | False | `5000` | [1, 600000] | HTTP request timeout in milliseconds for embedding API calls. |
+| `semantic.embedding.ssl_verify` | boolean | False | `true` | | Whether to verify the embedding endpoint's TLS certificate. |
+| `cache_key.include_consumer` | boolean | False | `false` | | If `true`, partition the cache by consumer name. |
+| `cache_key.include_vars` | array[string] | False | `[]` | | Additional `ctx.var` names included in the cache key, for example `["$http_x_tenant_id"]`. |
+| `bypass_on` | array[object] | False | | | List of `{header, equals}` rules. If any matches, the request bypasses the cache. |
+| `max_cache_body_size` | integer | False | `1048576` | [1, ...] | Maximum response size in bytes to write to cache. Larger responses pass through but are not cached. |
+| `headers.cache_status` | string | False | `"X-AI-Cache-Status"` | | Response header for cache status (`HIT-L1`, `HIT-L2`, `MISS`, `BYPASS`). |
+| `headers.cache_age` | string | False | `"X-AI-Cache-Age"` | | Response header for the age in seconds of an exact-layer hit. |
+| `headers.cache_similarity` | string | False | `"X-AI-Cache-Similarity"` | | Response header for the similarity score of a semantic-layer hit. |
+| `redis_host` | string | True | | | The address of the Redis node. |
+| `redis_port` | integer | False | `6379` | [1, ...] | The port of the Redis node. |
+| `redis_username` | string | False | | | The username for Redis if Redis ACL is used. If you use the legacy authentication method `requirepass`, configure only the `redis_password`. |
+| `redis_password` | string | False | | | The password of the Redis node. |
+| `redis_database` | integer | False | `0` | [0, ...] | The database number in Redis. |
+| `redis_timeout` | integer | False | `1000` | [1, ...] | The Redis timeout value in milliseconds. |
+| `redis_ssl` | boolean | False | `false` | | If `true`, use SSL to connect to Redis. |
+| `redis_ssl_verify` | boolean | False | `false` | | If `true`, verify the server SSL certificate. |
+| `redis_keepalive_timeout` | integer | False | `10000` | [1000, ...] | Idle timeout in milliseconds for the Redis connection in the keepalive pool. |
+| `redis_keepalive_pool` | integer | False | `100` | [1, ...] | Maximum number of idle Redis connections kept in the keepalive pool. |
+
+## Examples
+
+The following examples use OpenAI as the Upstream service provider. Before proceeding, create an [OpenAI account](https://openai.com) and an [API key](https://openai.com/blog/openai-api). You can optionally save the key to an environment variable:
+
+```shell
+export OPENAI_API_KEY=
+```
+
+If you are working with other LLM providers, please refer to the provider's documentation to obtain an API key.
+
+:::note
+
+You can fetch the `admin_key` from `config.yaml` and save it to an environment variable with the following command:
+
+```shell
+admin_key=$(yq '.deployment.admin.admin_key[0].key' conf/config.yaml | sed 's/"//g')
+```
+
+:::
+
+### Cache Identical Prompts with the Exact Layer
+
+The following example demonstrates how to use the `ai-cache` Plugin with the exact layer only, so that identical prompts are returned from cache.
+
+
+
+
+Create a Route that uses [ai-proxy](./ai-proxy.md) to proxy to OpenAI and `ai-cache` to cache exact-match prompts:
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "uri": "/anything",
+ "methods": ["POST"],
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer '"$OPENAI_API_KEY"'"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 3600 },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }'
+```
+
+
+
+
+Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such:
+
+```yaml title="adc.yaml"
+services:
+ - name: ai-cache-service
+ routes:
+ - name: ai-cache-route
+ uris:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ ai-proxy:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer ${OPENAI_API_KEY}"
+ options:
+ model: gpt-4
+ ai-cache:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ redis_host: 127.0.0.1
+```
+
+Synchronize the configuration to the gateway:
+
+```shell
+adc sync -f adc.yaml
+```
+
+
+
+
+
+
+
+Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such:
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v1alpha1
+kind: PluginConfig
+metadata:
+ namespace: aic
+ name: ai-cache-plugin-config
+spec:
+ plugins:
+ - name: ai-proxy
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ redis_host: redis-stack
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ parentRefs:
+ - name: apisix
+ rules:
+ - matches:
+ - path:
+ type: Exact
+ value: /anything
+ method: POST
+ filters:
+ - type: ExtensionRef
+ extensionRef:
+ group: apisix.apache.org
+ kind: PluginConfig
+ name: ai-cache-plugin-config
+```
+
+
+
+
+Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such:
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v2
+kind: ApisixRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ ingressClassName: apisix
+ http:
+ - name: ai-cache-route
+ match:
+ paths:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ - name: ai-proxy
+ enable: true
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ enable: true
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ redis_host: redis-stack
+```
+
+
+
+
+Apply the configuration to your cluster:
+
+```shell
+kubectl apply -f ai-cache-ic.yaml
+```
+
+
+
+
+Send a request to the Route:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital of France?" }
+ ]
+ }'
+```
+
+The first request reaches OpenAI. Note the `X-AI-Cache-Status: MISS` header, indicating the prompt was not in cache and APISIX forwarded the request upstream:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: MISS
+
+{
+ "id": "chatcmpl-Da7Iqsqz9gc8Mkf07Hn4NCzAH5Ri1",
+ "object": "chat.completion",
+ "created": 1777500252,
+ "model": "gpt-4o-mini-2024-07-18",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital of France is Paris.",
+ "refusal": null
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 14,
+ "completion_tokens": 7,
+ "total_tokens": 21
+ },
+ "system_fingerprint": "fp_d3214ccada"
+}
+```
+
+Send the same request again:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital of France?" }
+ ]
+ }'
+```
+
+The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header signals an exact-match hit and `X-AI-Cache-Age` reports the entry's age in seconds. The cached response is replayed from Redis, so the body is shorter and does not contain the original `created`, `model`, `usage`, or `system_fingerprint` fields:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: HIT-L1
+X-AI-Cache-Age: 4
+
+{
+ "id": "f558665e-3a03-42e3-9aa9-f54c402927c0",
+ "object": "chat.completion",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "content": "The capital of France is Paris.",
+ "role": "assistant"
+ },
+ "finish_reason": "stop"
+ }
+ ]
+}
+```
+
+### Cache Paraphrased Prompts with the Semantic Layer
+
+The following example demonstrates how to enable the semantic layer so that prompts with different wording but similar meaning are served from cache.
+
+
+
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "uri": "/anything",
+ "methods": ["POST"],
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer '"$OPENAI_API_KEY"'"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact", "semantic"],
+ "exact": { "ttl": 3600 },
+ "semantic": {
+ "similarity_threshold": 0.85,
+ "ttl": 86400,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "https://api.openai.com/v1/embeddings",
+ "api_key": "'"$OPENAI_API_KEY"'",
+ "model": "text-embedding-3-small"
+ }
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }'
+```
+
+
+
+
+```yaml title="adc.yaml"
+services:
+ - name: ai-cache-service
+ routes:
+ - name: ai-cache-route
+ uris:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ ai-proxy:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer ${OPENAI_API_KEY}"
+ options:
+ model: gpt-4
+ ai-cache:
+ layers:
+ - exact
+ - semantic
+ exact:
+ ttl: 3600
+ semantic:
+ similarity_threshold: 0.85
+ ttl: 86400
+ embedding:
+ provider: openai
+ endpoint: https://api.openai.com/v1/embeddings
+ api_key: "${OPENAI_API_KEY}"
+ model: text-embedding-3-small
+ redis_host: 127.0.0.1
+```
+
+Synchronize the configuration to the gateway:
+
+```shell
+adc sync -f adc.yaml
+```
+
+
+
+
+
+
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v1alpha1
+kind: PluginConfig
+metadata:
+ namespace: aic
+ name: ai-cache-plugin-config
+spec:
+ plugins:
+ - name: ai-proxy
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ config:
+ layers:
+ - exact
+ - semantic
+ exact:
+ ttl: 3600
+ semantic:
+ similarity_threshold: 0.85
+ ttl: 86400
+ embedding:
+ provider: openai
+ endpoint: https://api.openai.com/v1/embeddings
+ api_key: your-api-key
+ model: text-embedding-3-small
+ redis_host: redis-stack
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ parentRefs:
+ - name: apisix
+ rules:
+ - matches:
+ - path:
+ type: Exact
+ value: /anything
+ method: POST
+ filters:
+ - type: ExtensionRef
+ extensionRef:
+ group: apisix.apache.org
+ kind: PluginConfig
+ name: ai-cache-plugin-config
+```
+
+
+
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v2
+kind: ApisixRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ ingressClassName: apisix
+ http:
+ - name: ai-cache-route
+ match:
+ paths:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ - name: ai-proxy
+ enable: true
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ enable: true
+ config:
+ layers:
+ - exact
+ - semantic
+ exact:
+ ttl: 3600
+ semantic:
+ similarity_threshold: 0.85
+ ttl: 86400
+ embedding:
+ provider: openai
+ endpoint: https://api.openai.com/v1/embeddings
+ api_key: your-api-key
+ model: text-embedding-3-small
+ redis_host: redis-stack
+```
+
+
+
+
+Apply the configuration to your cluster:
+
+```shell
+kubectl apply -f ai-cache-ic.yaml
+```
+
+
+
+
+Send a first request:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital city of China?" }
+ ]
+ }'
+```
+
+The first request reaches OpenAI with `X-AI-Cache-Status: MISS`:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: MISS
+
+{
+ "id": "chatcmpl-DcCIDs6ZJisclo84FUk5fT2Ks5vzn",
+ "object": "chat.completion",
+ "model": "gpt-4-0613",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital city of China is Beijing."
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 }
+}
+```
+
+Wait a couple of seconds for the semantic-layer write to complete in the background, then send a second request with paraphrased wording:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "Capital city of China?" }
+ ]
+ }'
+```
+
+The semantic layer matches the embedding (cosine similarity above the threshold) and returns the cached response without contacting OpenAI. The `X-AI-Cache-Status: HIT-L2` header signals a semantic-layer hit and `X-AI-Cache-Similarity` reports the cosine similarity score:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: HIT-L2
+X-AI-Cache-Similarity: 0.9065774679184
+
+{
+ "id": "a95488bb-4a51-491a-bd5b-2c1d0e5f8a9b",
+ "object": "chat.completion",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "content": "The capital city of China is Beijing.",
+ "role": "assistant"
+ },
+ "finish_reason": "stop"
+ }
+ ]
+}
+```
+
+When the `exact` layer is also enabled (as in this example), a semantic-layer hit backfills it, so an immediate retry of the same paraphrase returns `X-AI-Cache-Status: HIT-L1`.
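+
+To observe the backfill, resend the exact same paraphrase right away; since the semantic hit above already wrote the entry to the exact layer synchronously, the retry should report an exact-layer hit:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [
+      { "role": "user", "content": "Capital city of China?" }
+    ]
+  }'
+```
+
+The response headers should now include `X-AI-Cache-Status: HIT-L1`.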
+
+### Isolate Cache Entries Per Consumer or Tenant
+
+The following example demonstrates how to namespace cache entries so that one consumer's response is not served to another. Use `cache_key.include_consumer` to partition by consumer name, or `cache_key.include_vars` to include request variables such as a tenant header.
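+
+Under the hood (per this plugin's exact-layer implementation), the consumer name and each configured variable value are joined and hashed into a scope prefix, so exact-layer keys take the form `ai-cache:l1:<scope_hash>:<prompt_hash>`. You can inspect the resulting partitioning directly in Redis:
+
+```shell
+# list exact-layer entries; identical prompts from different tenants
+# appear under different <scope_hash> prefixes
+redis-cli --scan --match 'ai-cache:l1:*'
+```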
+
+
+
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "uri": "/anything",
+ "methods": ["POST"],
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer '"$OPENAI_API_KEY"'"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 3600 },
+ "cache_key": {
+ "include_consumer": true,
+ "include_vars": ["$http_x_tenant_id"]
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }'
+```
+
+
+
+
+```yaml title="adc.yaml"
+services:
+ - name: ai-cache-service
+ routes:
+ - name: ai-cache-route
+ uris:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ ai-proxy:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer ${OPENAI_API_KEY}"
+ options:
+ model: gpt-4
+ ai-cache:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ cache_key:
+ include_consumer: true
+ include_vars:
+ - "$http_x_tenant_id"
+ redis_host: 127.0.0.1
+```
+
+Synchronize the configuration to the gateway:
+
+```shell
+adc sync -f adc.yaml
+```
+
+
+
+
+
+
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v1alpha1
+kind: PluginConfig
+metadata:
+ namespace: aic
+ name: ai-cache-plugin-config
+spec:
+ plugins:
+ - name: ai-proxy
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ cache_key:
+ include_consumer: true
+ include_vars:
+ - "$http_x_tenant_id"
+ redis_host: redis-stack
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ parentRefs:
+ - name: apisix
+ rules:
+ - matches:
+ - path:
+ type: Exact
+ value: /anything
+ method: POST
+ filters:
+ - type: ExtensionRef
+ extensionRef:
+ group: apisix.apache.org
+ kind: PluginConfig
+ name: ai-cache-plugin-config
+```
+
+
+
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v2
+kind: ApisixRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ ingressClassName: apisix
+ http:
+ - name: ai-cache-route
+ match:
+ paths:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ - name: ai-proxy
+ enable: true
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ enable: true
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ cache_key:
+ include_consumer: true
+ include_vars:
+ - "$http_x_tenant_id"
+ redis_host: redis-stack
+```
+
+
+
+
+Apply the configuration to your cluster:
+
+```shell
+kubectl apply -f ai-cache-ic.yaml
+```
+
+
+
+
+Send a first request as `tenant-a`:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -H "X-Tenant-Id: tenant-a" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital city of Japan?" }
+ ]
+ }'
+```
+
+The first request reaches OpenAI with `X-AI-Cache-Status: MISS` and primes `tenant-a`'s cache scope:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: MISS
+
+{
+ "id": "chatcmpl-DcCRAzeSsimIOIeLQWsKtDxMLAAhu",
+ "object": "chat.completion",
+ "model": "gpt-4-0613",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital city of Japan is Tokyo."
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 }
+}
+```
+
+Repeat the same prompt as `tenant-a`:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -H "X-Tenant-Id: tenant-a" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital city of Japan?" }
+ ]
+ }'
+```
+
+The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header confirms `tenant-a`'s entry was reused:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: HIT-L1
+X-AI-Cache-Age: 6
+
+{
+ "id": "6be4f7a2-83f1-4cdc-8654-cee0396bd4f3",
+ "object": "chat.completion",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "content": "The capital city of Japan is Tokyo.",
+ "role": "assistant"
+ },
+ "finish_reason": "stop"
+ }
+ ]
+}
+```
+
+Send the same prompt as a different tenant, `tenant-b`:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -H "X-Tenant-Id: tenant-b" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital city of Japan?" }
+ ]
+ }'
+```
+
+Even though the prompt is identical, the request reaches OpenAI with `X-AI-Cache-Status: MISS` because `tenant-b` has its own cache scope:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: MISS
+
+{
+ "id": "chatcmpl-DcCROH92JLWcgyhSpwEoutTvqnew5",
+ "object": "chat.completion",
+ "model": "gpt-4-0613",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital city of Japan is Tokyo."
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 }
+}
+```
+
+### Bypass the Cache on a Header
+
+The following example demonstrates how to skip the cache entirely when a request carries a specific header, for example to force a fresh response from the upstream or to aid debugging by support staff.
+
+
+
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "uri": "/anything",
+ "methods": ["POST"],
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer '"$OPENAI_API_KEY"'"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 3600 },
+ "bypass_on": [
+ { "header": "X-Cache-Bypass", "equals": "1" }
+ ],
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }'
+```
+
+
+
+
+```yaml title="adc.yaml"
+services:
+ - name: ai-cache-service
+ routes:
+ - name: ai-cache-route
+ uris:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ ai-proxy:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer ${OPENAI_API_KEY}"
+ options:
+ model: gpt-4
+ ai-cache:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ bypass_on:
+ - header: X-Cache-Bypass
+ equals: "1"
+ redis_host: 127.0.0.1
+```
+
+Synchronize the configuration to the gateway:
+
+```shell
+adc sync -f adc.yaml
+```
+
+If you run APISIX on Kubernetes with the APISIX Ingress Controller, you can express the same configuration with a `PluginConfig` referenced from an `HTTPRoute`:
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v1alpha1
+kind: PluginConfig
+metadata:
+ namespace: aic
+ name: ai-cache-plugin-config
+spec:
+ plugins:
+ - name: ai-proxy
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ bypass_on:
+ - header: X-Cache-Bypass
+ equals: "1"
+ redis_host: redis-stack
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ parentRefs:
+ - name: apisix
+ rules:
+ - matches:
+ - path:
+ type: Exact
+ value: /anything
+ method: POST
+ filters:
+ - type: ExtensionRef
+ extensionRef:
+ group: apisix.apache.org
+ kind: PluginConfig
+ name: ai-cache-plugin-config
+```
+
+Alternatively, use the `ApisixRoute` custom resource:
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v2
+kind: ApisixRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ ingressClassName: apisix
+ http:
+ - name: ai-cache-route
+ match:
+ paths:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ - name: ai-proxy
+ enable: true
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ enable: true
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ bypass_on:
+ - header: X-Cache-Bypass
+ equals: "1"
+ redis_host: redis-stack
+```
+
+Apply the configuration to your cluster:
+
+```shell
+kubectl apply -f ai-cache-ic.yaml
+```
+
+Send a request with the bypass header:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -H "X-Cache-Bypass: 1" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital of France?" }
+ ]
+ }'
+```
+
+The request reaches OpenAI regardless of any existing cached entry, and the response is not written back to the cache. The `X-AI-Cache-Status: BYPASS` header confirms the cache was skipped, and the presence of the original `created`, `model`, `usage`, and `system_fingerprint` fields verifies the upstream was contacted:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: BYPASS
+
+{
+ "id": "chatcmpl-Da7N4E9fA6KoQ7av98hL0zxplPCcD",
+ "object": "chat.completion",
+ "created": 1777500514,
+ "model": "gpt-4o-mini-2024-07-18",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital of France is Paris.",
+ "refusal": null
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 14,
+ "completion_tokens": 7,
+ "total_tokens": 21
+ },
+ "system_fingerprint": "fp_d3214ccada"
+}
+```
+
+## Caveats
+
+### The semantic-layer write is asynchronous
+
+After a `MISS`, the embedding fetch and the write to the Redis vector store happen in a background timer. If you send a paraphrased prompt immediately after the first request, you may see another `MISS` because the entry has not been stored yet. Wait a couple of seconds before sending a paraphrase to verify a semantic hit.
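+
+For example, a quick way to verify a semantic hit from the command line is to prime the cache, pause, and then send the paraphrase. This is a sketch that assumes a route with the semantic layer enabled, as configured earlier in this guide; the two-second pause is an illustrative value, not a guarantee:
+
+```shell
+# Prime the cache; this request is a MISS and schedules the background
+# embedding fetch and vector write.
+curl -s "http://127.0.0.1:9080/anything" -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"messages":[{"role":"user","content":"What is the capital of France?"}]}' > /dev/null
+
+# Give the background timer time to store the entry.
+sleep 2
+
+# The paraphrase should now return X-AI-Cache-Status: HIT-L2.
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"messages":[{"role":"user","content":"Name the capital city of France"}]}'
+```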
+
+### Similarity is mathematical, not human-judged
+
+Two prompts that look semantically equivalent to a human can score below the configured `similarity_threshold` and therefore miss the cache. Conversely, a small wording change can flip the result. For example, with `similarity_threshold` set to `0.85` and the cache primed with `"What is the capital of France?"`:
+
+| Prompt | Status | Similarity |
+|--------|--------|------------|
+| `capital of France?` | `HIT-L2` | `0.850` |
+| `capital of France what?` | `MISS` | (below threshold) |
+| `capital of France what is?` | `HIT-L2` | `0.972` |
+| `capital of France what please?` | `HIT-L2` | `0.924` |
+| `capital of France what is please tell me?` | `MISS` | (below threshold) |
+
+Lower the threshold to catch more paraphrases at the cost of occasionally serving a cached answer for a genuinely different question. Tune empirically against your traffic.
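+
+As a sketch, loosening the threshold is a single field change in the plugin configuration. The `0.80` below is an illustrative value, and the embedding settings mirror the earlier examples:
+
+```yaml
+ai-cache:
+  layers:
+    - exact
+    - semantic
+  exact:
+    ttl: 3600
+  semantic:
+    # Lower values admit looser paraphrases but raise the risk of
+    # serving a cached answer for a genuinely different question.
+    similarity_threshold: 0.80
+    ttl: 300
+    embedding:
+      provider: openai
+      endpoint: https://api.openai.com/v1/embeddings
+      api_key: ${OPENAI_API_KEY}
+  redis_host: 127.0.0.1
+```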
+
+### `BYPASS` does not refresh the cache
+
+A request with the bypass header reaches the upstream but its response is not written back. Use it to force a fresh upstream call without invalidating or replacing the existing cached entry.
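+
+If you need to actually evict entries, one option is to delete the keys directly in Redis. The `ai-cache:*` pattern below matches the key prefix used by the plugin's test suite and is an assumption about your deployment; verify it before running this against production data:
+
+```shell
+# Illustrative only: evict every ai-cache entry in this Redis instance.
+# The 'ai-cache:*' prefix is assumed from the plugin's test fixtures.
+redis-cli --scan --pattern 'ai-cache:*' | xargs -r redis-cli DEL
+```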
+
+The bypass header is not authenticated; any client that can set the configured header and value can bypass the cache. In production, gate access with an APISIX authentication plugin such as `key-auth` or `ip-restriction`, or strip the header at your upstream WAF.
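+
+As a minimal sketch, the ADC route from the earlier example can be gated by adding `key-auth` to its plugins; this assumes consumers with key-auth credentials have already been created:
+
+```yaml title="adc.yaml"
+services:
+  - name: ai-cache-service
+    routes:
+      - name: ai-cache-route
+        uris:
+          - /anything
+        methods:
+          - POST
+        plugins:
+          # Only authenticated consumers can reach the route,
+          # and therefore only they can send X-Cache-Bypass.
+          key-auth: {}
+          ai-proxy:
+            provider: openai
+            auth:
+              header:
+                Authorization: "Bearer ${OPENAI_API_KEY}"
+            options:
+              model: gpt-4
+          ai-cache:
+            layers:
+              - exact
+            exact:
+              ttl: 3600
+            bypass_on:
+              - header: X-Cache-Bypass
+                equals: "1"
+            redis_host: 127.0.0.1
+```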
diff --git a/t/admin/plugins.t b/t/admin/plugins.t
index adb98b28bc17..1454ec145eb0 100644
--- a/t/admin/plugins.t
+++ b/t/admin/plugins.t
@@ -98,6 +98,7 @@ ai-request-rewrite
ai-prompt-guard
ai-prompt-template
ai-prompt-decorator
+ai-cache
ai-rag
ai-aws-content-moderation
ai-proxy-multi
diff --git a/t/plugin/ai-cache-scope.t b/t/plugin/ai-cache-scope.t
new file mode 100644
index 000000000000..21facdcf4688
--- /dev/null
+++ b/t/plugin/ai-cache-scope.t
@@ -0,0 +1,384 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+BEGIN {
+ $ENV{TEST_ENABLE_CONTROL_API_V1} = "0";
+}
+
+use t::APISIX 'no_plan';
+
+log_level("info");
+repeat_each(1);
+no_long_string();
+no_shuffle();
+no_root_location();
+
+add_block_preprocessor(sub {
+ my ($block) = @_;
+
+ if (!defined $block->request) {
+ $block->set_value("request", "GET /t");
+ }
+
+ if (!$block->error_log && !$block->no_error_log) {
+ $block->set_value("no_error_log", "[error]\n[alert]");
+ }
+
+ if (!defined $block->http_config) {
+ $block->set_value("http_config", <<_EOC_);
+server {
+ listen 1990;
+ default_type 'application/json';
+
+ location /v1/embeddings {
+ content_by_lua_block {
+ local fixture_loader = require("lib.fixture_loader")
+ local content, err = fixture_loader.load("openai/embeddings-list.json")
+ if not content then
+ ngx.status = 500
+ ngx.say(err)
+ return
+ end
+
+ ngx.status = 200
+ ngx.print(content)
+ }
+ }
+}
+_EOC_
+ }
+});
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: set up route with cache_key include_vars
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/scoped",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "cache_key": {
+ "include_vars": ["$http_x_tenant_id"]
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 2: tenant-a first request - MISS
+--- request
+POST /scoped
+{"messages":[{"role":"user","content":"scope test prompt"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-a
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 3: tenant-b same prompt - MISS (proves cache_key partitioning)
+--- request
+POST /scoped
+{"messages":[{"role":"user","content":"scope test prompt"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-b
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 4: tenant-a same prompt again - HIT-L1
+--- request
+POST /scoped
+{"messages":[{"role":"user","content":"scope test prompt"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-a
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+
+
+
+=== TEST 5: set up consumers for include_consumer test
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+
+ local consumers = {
+ { username = "alice", key = "alice-key" },
+ { username = "bob", key = "bob-key" },
+ }
+
+ for _, c in ipairs(consumers) do
+ local code, body = t('/apisix/admin/consumers',
+ ngx.HTTP_PUT,
+ string.format([[{
+ "username": "%s",
+ "plugins": { "key-auth": { "key": "%s" } }
+ }]], c.username, c.key)
+ )
+ if code >= 300 then
+ ngx.status = code
+ ngx.say(body)
+ return
+ end
+ end
+ ngx.say("passed")
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 6: set up route with cache_key include_consumer + key-auth
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/2',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/per-consumer",
+ "plugins": {
+ "key-auth": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "cache_key": {
+ "include_consumer": true
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 7: alice first request - MISS
+--- request
+POST /per-consumer
+{"messages":[{"role":"user","content":"per-consumer prompt"}]}
+--- more_headers
+Content-Type: application/json
+apikey: alice-key
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 8: bob same prompt - MISS (proves include_consumer partitioning)
+--- request
+POST /per-consumer
+{"messages":[{"role":"user","content":"per-consumer prompt"}]}
+--- more_headers
+Content-Type: application/json
+apikey: bob-key
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 9: bob same prompt again - HIT-L1 (proves bob has own cache)
+--- request
+POST /per-consumer
+{"messages":[{"role":"user","content":"per-consumer prompt"}]}
+--- more_headers
+Content-Type: application/json
+apikey: bob-key
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+
+
+
+=== TEST 10: set up route with L2 semantic + cache_key include_vars
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/3',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/scoped-semantic",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact", "semantic"],
+ "exact": { "ttl": 60 },
+ "semantic": {
+ "similarity_threshold": 0.90,
+ "ttl": 300,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "http://127.0.0.1:1990/v1/embeddings",
+ "api_key": "test-key"
+ }
+ },
+ "cache_key": {
+ "include_vars": ["$http_x_tenant_id"]
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 11: tenant-a first request - MISS, writes to L2 with scope=hash(tenant-a)
+--- request
+POST /scoped-semantic
+{"messages":[{"role":"user","content":"What is the capital of France??"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-a
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 12: tenant-b same prompt - MISS (FT.SEARCH scope filter excludes tenant-a's entry)
+--- request
+POST /scoped-semantic
+{"messages":[{"role":"user","content":"What is the capital of France??"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-b
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 13: tenant-a paraphrase - HIT-L2 (scope filter finds tenant-a's entry)
+--- request
+POST /scoped-semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-a
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L2
+
+
+
+=== TEST 14: tenant-b paraphrase - HIT-L2 (proves tenant-b has own L2 entry)
+--- request
+POST /scoped-semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-b
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L2
diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t
new file mode 100644
index 000000000000..abc652328ac8
--- /dev/null
+++ b/t/plugin/ai-cache.t
@@ -0,0 +1,818 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+BEGIN {
+ $ENV{TEST_ENABLE_CONTROL_API_V1} = "0";
+}
+
+use t::APISIX 'no_plan';
+
+log_level("info");
+repeat_each(1);
+no_long_string();
+no_shuffle();
+no_root_location();
+
+add_block_preprocessor(sub {
+ my ($block) = @_;
+
+ if (!defined $block->request) {
+ $block->set_value("request", "GET /t");
+ }
+
+ if (!$block->error_log && !$block->no_error_log) {
+ $block->set_value("no_error_log", "[error]\n[alert]");
+ }
+
+ if (!defined $block->http_config) {
+ $block->set_value("http_config", <<_EOC_);
+server {
+ listen 1990;
+ default_type 'application/json';
+
+ location /v1/embeddings {
+ content_by_lua_block {
+ local fixture_loader = require("lib.fixture_loader")
+ local content, err = fixture_loader.load("openai/embeddings-list.json")
+ if not content then
+ ngx.status = 500
+ ngx.say(err)
+ return
+ end
+
+ ngx.status = 200
+ ngx.print(content)
+ }
+ }
+}
+_EOC_
+ }
+});
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: valid config - exact layer only
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "exact" },
+ exact = { ttl = 600 },
+ redis_host = "127.0.0.1",
+ redis_port = 6379,
+ })
+
+ if not ok then
+ ngx.say("failed")
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 2: valid config - both layers with semantic embedding
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "exact", "semantic" },
+ exact = { ttl = 3600 },
+ semantic = {
+ similarity_threshold = 0.95,
+ ttl = 86400,
+ embedding = {
+ provider = "openai",
+ endpoint = "https://api.openai.com/v1/embeddings",
+ api_key = "sk-test",
+ },
+ },
+ redis_host = "127.0.0.1",
+ redis_port = 6379,
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 3: semantic without embedding config - should fail
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "semantic" },
+ redis_host = "127.0.0.1",
+ })
+
+ if not ok then
+ ngx.say("failed: ", err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body
+failed: semantic layer requires semantic.embedding to be configured
+
+
+
+=== TEST 4: invalid layer value - should fail
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "invalid_layer" },
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body eval
+qr/.*property "layers" validation failed:.*matches none of the enum values.*/
+
+
+
+=== TEST 5: unsupported embedding provider - should fail
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "semantic" },
+ semantic = {
+ embedding = {
+ provider = "some-unknown-provider",
+ endpoint = "https://example.com/embeddings",
+ api_key = "key",
+ },
+ },
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body eval
+qr/.*property "provider" validation failed: matches none of the enum values.*/
+
+
+
+=== TEST 6: similarity_threshold out of range - should fail
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "semantic" },
+ semantic = {
+ similarity_threshold = 1.5,
+ embedding = {
+ provider = "openai",
+ endpoint = "https://api.openai.com/v1/embeddings",
+ api_key = "sk-test",
+ },
+ },
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body eval
+qr/.*property "similarity_threshold" validation failed: expected 1\.5 to be at most.*/
+
+
+
+=== TEST 7: layers empty array - should fail (minItems=1)
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = {},
+ redis_host = "127.0.0.1",
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body eval
+qr/.*property "layers" validation failed: expect array to have at least 1 items.*/
+
+
+
+=== TEST 8: set up route for L1 cache tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/exact",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1",
+ "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}]
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 9: first request - cache MISS, upstream called
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the answer to life?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+
+
+
+=== TEST 10: second identical request - cache HIT-L1, no upstream call
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the answer to life?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+--- response_headers_like
+X-AI-Cache-Age: \d+
+--- response_body_like eval
+qr/"content":\s?"1 \+ 1 = 2\."/
+--- error_log
+ai-cache: L1 hit for key
+
+
+
+=== TEST 11: bypass header - BYPASS, upstream called, not cached
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the bypass question?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-Cache-Bypass: 1
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: BYPASS
+
+
+
+=== TEST 12: same prompt without bypass after bypass - still MISS (bypass did not cache)
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the bypass question?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 13: set up route with two bypass rules
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/exact",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1",
+ "bypass_on": [
+ {"header": "X-Cache-Bypass", "equals": "1"},
+ {"header": "X-Debug", "equals": "true"}
+ ]
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 14: first bypass rule matches - BYPASS
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"multi-rule bypass test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-Cache-Bypass: 1
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: BYPASS
+
+
+
+=== TEST 15: second bypass rule matches - BYPASS
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"multi-rule bypass test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-Debug: true
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: BYPASS
+
+
+
+=== TEST 16: set up route for upstream-status filter tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/2',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/error",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 17: non-2xx upstream response - not cached (status code filter)
+--- request
+POST /error
+{"messages":[{"role":"user","content":"trigger a server error"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-AI-Fixture-Status: 500
+--- error_code: 500
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 18: same prompt after non-2xx - still MISS (was not cached)
+--- request
+POST /error
+{"messages":[{"role":"user","content":"trigger a server error"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-AI-Fixture-Status: 500
+--- error_code: 500
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 19: set up route with very small max_cache_body_size
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/3',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/tiny",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "max_cache_body_size": 5,
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 20: oversize response - MISS, log warns and skips cache write
+--- request
+POST /tiny
+{"messages":[{"role":"user","content":"oversize body test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+--- error_log
+exceeds max_cache_body_size
+
+
+
+=== TEST 21: same prompt after oversize - still MISS (was not cached)
+--- request
+POST /tiny
+{"messages":[{"role":"user","content":"oversize body test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+--- error_log
+exceeds max_cache_body_size
+
+
+
+=== TEST 22: set up route with custom cache header names
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/4',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/custom-headers",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "headers": {
+ "cache_status": "X-Custom-Status",
+ "cache_age": "X-Custom-Age"
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 23: MISS populates the cache and emits custom status header
+--- request
+POST /custom-headers
+{"messages":[{"role":"user","content":"custom header test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-Custom-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+--- wait: 1
+
+
+
+=== TEST 24: HIT emits custom status and age headers (defaults not used)
+--- request
+POST /custom-headers
+{"messages":[{"role":"user","content":"custom header test"}]}
+--- more_headers
+Content-Type: application/json
+--- error_code: 200
+--- response_headers
+X-Custom-Status: HIT-L1
+X-AI-Cache-Status:
+X-AI-Cache-Age:
+--- response_headers_like
+X-Custom-Age: \d+
+--- response_body_like eval
+qr/"content":\s?"1 \+ 1 = 2\."/
+
+
+
+=== TEST 25: clean up Redis cache state before semantic tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local redis = require("resty.redis")
+ local red = redis:new()
+ red:set_timeout(1000)
+ assert(red:connect("127.0.0.1", 6379))
+
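+            -- Drop the vector index and its documents ("DD") left over from a
+            -- previous run. The reply is deliberately ignored: the call fails
+            -- harmlessly when the index does not exist yet.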
+ red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD")
+
+ local keys = red:keys("ai-cache:*")
+ if type(keys) == "table" and #keys > 0 then
+ red:del(unpack(keys))
+ end
+
+ red:close()
+ ngx.say("ok")
+ }
+ }
+--- response_body
+ok
+
+
+
+=== TEST 26: set up route for L2 semantic cache tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/5',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/semantic",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact", "semantic"],
+ "exact": {
+ "ttl": 60
+ },
+ "semantic": {
+ "similarity_threshold": 0.90,
+ "ttl": 300,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "http://127.0.0.1:1990/v1/embeddings",
+ "api_key": "test-key"
+ }
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 27: L2 - first request, cache MISS, stored in L2
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"What is the capital of France??"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+
+
+
+=== TEST 28: L2 - different wording hits L2 (same vector from fixture)
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L2
+--- response_headers_like
+X-AI-Cache-Similarity: \d+(\.\d+)?
+--- response_body_like eval
+qr/"content":\s?"1 \+ 1 = 2\."/
+--- error_log
+ai-cache: L2 hit
+
+
+
+=== TEST 29: L2 - paraphrase now hits L1 (backfilled by the previous L2 hit)
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+--- response_body_like eval
+qr/"content":\s?"1 \+ 1 = 2\."/
+--- error_log
+ai-cache: L1 hit for key
+
+
+
+=== TEST 30: streaming MISS - upstream called, response cached via log phase
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-streaming.sse
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/data:.*"content":"Hello"/
+
+
+
+=== TEST 31: streaming HIT - Content-Type is text/event-stream, SSE body returned
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true}
+--- more_headers
+Content-Type: application/json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+Content-Type: text/event-stream
+--- response_body_like eval
+qr/data:.*"content":\s?"Hello!"/
+--- wait: 1
+
+
+
+=== TEST 32: non-streaming HIT after streaming MISS - returns JSON
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"Stream me something cool"}]}
+--- more_headers
+Content-Type: application/json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+Content-Type: application/json
+--- response_body_like eval
+qr/"content":\s?"Hello!"/
diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t
new file mode 100644
index 000000000000..3af1a6ae2491
--- /dev/null
+++ b/t/plugin/prometheus-ai-cache.t
@@ -0,0 +1,481 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+BEGIN {
+ $ENV{TEST_ENABLE_CONTROL_API_V1} = "0";
+
+ if ($ENV{TEST_NGINX_CHECK_LEAK}) {
+ $SkipReason = "unavailable for the hup tests";
+ } else {
+ $ENV{TEST_NGINX_USE_HUP} = 1;
+ undef $ENV{TEST_NGINX_USE_STAP};
+ }
+}
+
+use t::APISIX 'no_plan';
+
+repeat_each(1);
+no_long_string();
+no_shuffle();
+no_root_location();
+
+add_block_preprocessor(sub {
+ my ($block) = @_;
+
+ if (!defined $block->request) {
+ $block->set_value("request", "GET /t");
+ }
+
+ my $user_yaml_config = <<_EOC_;
+plugin_attr:
+ prometheus:
+ refresh_interval: 0.1
+plugins:
+ - ai-proxy
+ - ai-cache
+ - prometheus
+ - public-api
+ - key-auth
+_EOC_
+ $block->set_value("extra_yaml_config", $user_yaml_config);
+
+ if (!defined $block->http_config) {
+ $block->set_value("http_config", <<_EOC_);
+server {
+ listen 1990;
+ default_type 'application/json';
+
+ location /v1/embeddings {
+ content_by_lua_block {
+ local fixture_loader = require("lib.fixture_loader")
+ local content, err = fixture_loader.load("openai/embeddings-list.json")
+ if not content then
+ ngx.status = 500
+ ngx.say(err)
+ return
+ end
+
+ ngx.status = 200
+ ngx.print(content)
+ }
+ }
+
+ location /v1/embeddings-fail {
+ content_by_lua_block {
+ ngx.status = 500
+ ngx.say('{"error":"simulated embedding failure"}')
+ }
+ }
+}
+_EOC_
+ }
+});
+
+run_tests;
+
+__DATA__
+
+=== TEST 1: set up routes
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+
+ local routes = {
+ {
+ url = "/apisix/admin/routes/1",
+ data = [[{
+ "uri": "/exact",
+ "plugins": {
+ "prometheus": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1",
+ "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}]
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/routes/2",
+ data = [[{
+ "uri": "/semantic",
+ "plugins": {
+ "prometheus": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact", "semantic"],
+ "exact": { "ttl": 60 },
+ "semantic": {
+ "similarity_threshold": 0.90,
+ "ttl": 300,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "http://127.0.0.1:1990/v1/embeddings",
+ "api_key": "test-key"
+ }
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/routes/3",
+ data = [[{
+ "uri": "/semantic-fail",
+ "plugins": {
+ "prometheus": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["semantic"],
+ "semantic": {
+ "similarity_threshold": 0.90,
+ "ttl": 300,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "http://127.0.0.1:1990/v1/embeddings-fail",
+ "api_key": "test-key"
+ }
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/routes/4",
+ data = [[{
+ "uri": "/exact-auth",
+ "plugins": {
+ "prometheus": {},
+ "key-auth": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/consumers",
+ data = [[{
+ "username": "alice",
+ "plugins": {
+ "key-auth": {
+ "key": "alice-key"
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/routes/metrics",
+ data = [[{
+ "plugins": {
+ "public-api": {}
+ },
+ "uri": "/apisix/prometheus/metrics"
+ }]],
+ },
+ }
+
+ for _, route in ipairs(routes) do
+ local code, body = t(route.url, ngx.HTTP_PUT, route.data)
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ end
+ }
+ }
+--- response_body eval
+"passed\n" x 6
+
+
+
+=== TEST 2: MISS request - upstream called
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the meaning of life?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 3: same request - HIT-L1
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the meaning of life?"}]}
+--- more_headers
+Content-Type: application/json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+--- wait: 1
+
+
+
+=== TEST 4: verify miss counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1/
+
+
+
+=== TEST 5: verify hit counter with layer label
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1/
+
+
+
+=== TEST 6: BYPASS request - upstream called, no cache interaction
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the meaning of life?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-Cache-Bypass: 1
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: BYPASS
+
+
+
+=== TEST 7: verify BYPASS did not increment misses counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1\n/
+
+
+
+=== TEST 8: verify BYPASS did not increment hits counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1\n/
+
+
+
+=== TEST 9: cleanup Redis cache state before semantic tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local redis = require("resty.redis")
+ local red = redis:new()
+ red:set_timeout(1000)
+ assert(red:connect("127.0.0.1", 6379))
+
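+            -- Drop the vector index and its documents ("DD") left over from a
+            -- previous run. The reply is deliberately ignored: the call fails
+            -- harmlessly when the index does not exist yet.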
+ red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD")
+
+ local keys = red:keys("ai-cache:*")
+ if type(keys) == "table" and #keys > 0 then
+ red:del(unpack(keys))
+ end
+
+ red:close()
+ ngx.say("ok")
+ }
+ }
+--- response_body
+ok
+
+
+
+=== TEST 10: L2 first request - MISS, embedding API called
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"What is the capital of France??"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 11: L2 second request - different wording, HIT-L2
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L2
+--- wait: 1
+
+
+
+=== TEST 12: verify miss counter for semantic route (route_id=2)
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="2",service_id="",consumer=""\} 1/
+
+
+
+=== TEST 13: verify hits counter with layer="l2"
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_hits_total\{route_id="2",service_id="",consumer="",layer="l2"\} 1/
+
+
+
+=== TEST 14: verify embedding latency histogram with provider label
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_embedding_latency_count\{route_id="2",service_id="",consumer="",provider="openai"\} 2/
+
+
+
+=== TEST 15: embedding failure - request still returns 200 via fallback
+--- request
+POST /semantic-fail
+{"messages":[{"role":"user","content":"What does this fail at?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 16: verify embedding_failures counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_embedding_failures_total\{route_id="3",service_id="",consumer=""\} 1/
+
+
+
+=== TEST 17: verify embedding-failure request also counted as miss
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="3",service_id="",consumer=""\} 1/
+
+
+
+=== TEST 18: authenticated MISS request - consumer alice
+--- request
+POST /exact-auth
+{"messages":[{"role":"user","content":"Authenticated cache test"}]}
+--- more_headers
+Content-Type: application/json
+apikey: alice-key
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 19: authenticated HIT-L1 request - consumer alice
+--- request
+POST /exact-auth
+{"messages":[{"role":"user","content":"Authenticated cache test"}]}
+--- more_headers
+Content-Type: application/json
+apikey: alice-key
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+--- wait: 1
+
+
+
+=== TEST 20: verify consumer label is populated on hits counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_hits_total\{route_id="4",service_id="",consumer="alice",layer="l1"\} 1/
+
+
+
+=== TEST 21: verify consumer label is populated on misses counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="4",service_id="",consumer="alice"\} 1/
+
+
+
+=== TEST 22: verify cache hit is labelled as ai_chat (not traditional_http)
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_http_status\{code="200",route="1"[^}]*request_type="ai_chat"[^}]*response_source="apisix"[^}]*\} 1/