diff --git a/Makefile b/Makefile
index 71ab7df1eabf..ad074ecabbdf 100644
--- a/Makefile
+++ b/Makefile
@@ -388,6 +388,11 @@ install: runtime
$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-transport
$(ENV_INSTALL) apisix/plugins/ai-transport/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-transport
+ $(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache
+ $(ENV_INSTALL) apisix/plugins/ai-cache/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache
+ $(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings
+ $(ENV_INSTALL) apisix/plugins/ai-cache/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings
+
$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings
$(ENV_INSTALL) apisix/plugins/ai-rag/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings
$(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/vector-search
diff --git a/apisix/cli/config.lua b/apisix/cli/config.lua
index 956eef30c267..b4df580666c0 100644
--- a/apisix/cli/config.lua
+++ b/apisix/cli/config.lua
@@ -231,6 +231,7 @@ local _M = {
"ai-prompt-template",
"ai-prompt-decorator",
"ai-prompt-guard",
+ "ai-cache",
"ai-rag",
"ai-rate-limiting",
"ai-proxy-multi",
diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua
new file mode 100644
index 000000000000..476bd785d85a
--- /dev/null
+++ b/apisix/plugins/ai-cache.lua
@@ -0,0 +1,296 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+local schema = require("apisix.plugins.ai-cache.schema")
+local exact = require("apisix.plugins.ai-cache.exact")
+local semantic = require("apisix.plugins.ai-cache.semantic")
+local protocols = require("apisix.plugins.ai-protocols")
+local http = require("resty.http")
+local ngx = ngx
+local ngx_time = ngx.time
+local ngx_now = ngx.now
+local ipairs = ipairs
+local require = require
+local tostring = tostring
+local table_concat = table.concat
+
+local plugin_name = "ai-cache"
+
+local _M = {
+ version = 0.1,
+ priority = 1065,
+ name = plugin_name,
+ schema = schema.schema
+}
+
+
+local function layer_enabled(conf, name)
+ local layers = conf.layers or { "exact", "semantic" }
+ for _, l in ipairs(layers) do
+        if l == name then
+            return true
+        end
+ end
+ return false
+end
+
+
+local function populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text)
+ ctx.ai_client_protocol = protocol_name
+ ctx.var.request_type = is_stream and "ai_stream" or "ai_chat"
+ if body_tab.model then
+ ctx.var.request_llm_model = body_tab.model
+ ctx.var.llm_model = body_tab.model
+ end
+ ctx.var.llm_response_text = cached_text
+end
+
+
+function _M.check_schema(conf)
+ local ok, err = core.schema.check(schema.schema, conf)
+ if not ok then
+ return false, err
+ end
+
+ if layer_enabled(conf, "semantic") then
+ if not (conf.semantic and conf.semantic.embedding) then
+ return false, "semantic layer requires semantic.embedding to be configured"
+ end
+ end
+
+ core.utils.check_https({ "semantic.embedding.endpoint" }, conf, plugin_name)
+
+ return true
+end
+
+
+function _M.access(conf, ctx)
+ -- Check bypass_on conditions
+ if conf.bypass_on then
+ local req_headers = ngx.req.get_headers()
+ for _, rule in ipairs(conf.bypass_on) do
+ if req_headers[rule.header] == rule.equals then
+ ctx.ai_cache_status = "BYPASS"
+ return
+ end
+ end
+ end
+
+ local body_tab, err = core.request.get_json_request_body_table()
+ if not body_tab then
+ core.log.warn("ai-cache: failed to read request body: ", err or "unknown error")
+ ctx.ai_cache_status = "MISS"
+ return
+ end
+
+ local protocol_name = protocols.detect(body_tab, ctx)
+ if not protocol_name then
+ core.log.warn("ai-cache: could not detect AI protocol, skipping cache")
+ ctx.ai_cache_status = "MISS"
+ return
+ end
+
+ local proto = protocols.get(protocol_name)
+ local contents = proto.extract_request_content(body_tab)
+ if not contents or #contents == 0 then
+ ctx.ai_cache_status = "MISS"
+ return
+ end
+
+ local prompt_text = table_concat(contents, " ")
+ local scope_hash = exact.compute_scope_hash(conf, ctx)
+ local prompt_hash = exact.compute_prompt_hash(prompt_text)
+
+ local is_stream = body_tab.stream == true
+
+ -- L1 exact lookup
+ if layer_enabled(conf, "exact") then
+ local cached_text, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash)
+ if lookup_err then
+ core.log.warn("ai-cache: L1 lookup error: ", lookup_err)
+ elseif cached_text then
+ core.log.info("ai-cache: L1 hit for key: ", prompt_hash)
+ ctx.ai_cache_status = "HIT-L1"
+ ctx.ai_cache_written_at = written_at
+ if is_stream then
+ core.response.set_header("Content-Type", "text/event-stream")
+ else
+ core.response.set_header("Content-Type", "application/json")
+ end
+ populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text)
+ -- TODO: rename build_deny_response to build_response_from_text in a
+ -- follow-up. We use it here to wrap cached text in the protocol's
+ -- response shape, not for policy denial.
+ return 200, proto.build_deny_response({
+ stream = is_stream,
+ text = cached_text,
+ })
+ end
+ end
+
+ -- L2 semantic lookup
+ if layer_enabled(conf, "semantic") then
+ local emb_conf = conf.semantic.embedding
+ local emb_driver = require("apisix.plugins.ai-cache.embeddings." .. emb_conf.provider)
+ local httpc = http.new()
+
+ local t0 = ngx_now()
+ local embedding, _, emb_err = emb_driver.get_embeddings(
+ emb_conf, prompt_text, httpc, emb_conf.ssl_verify
+ )
+ if not embedding then
+ core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err)
+ ctx.ai_cache_embedding_failed = true
+ else
+ ctx.ai_cache_embedding_latency_ms = (ngx_now() - t0) * 1000
+ ctx.ai_cache_embedding_provider = emb_conf.provider
+ ctx.ai_cache_embedding = embedding
+
+ local threshold = conf.semantic.similarity_threshold or 0.95
+ local cached_text, similarity, search_err = semantic.search(
+ conf, scope_hash, embedding, threshold
+ )
+
+ if search_err then
+ core.log.warn("ai-cache: L2 search error (degrading to MISS): ", search_err)
+ elseif cached_text then
+ core.log.info("ai-cache: L2 hit, similarity=", similarity)
+
+ if layer_enabled(conf, "exact") then
+ local l1_ttl = (conf.exact and conf.exact.ttl) or 3600
+ local l1_err = exact.set(
+ conf, scope_hash, prompt_hash, cached_text, l1_ttl
+ )
+ if l1_err then
+ core.log.warn("ai-cache: L2->L1 backfill failed: ", l1_err)
+ end
+ end
+
+ ctx.ai_cache_status = "HIT-L2"
+ ctx.ai_cache_similarity = similarity
+ if is_stream then
+ core.response.set_header("Content-Type", "text/event-stream")
+ else
+ core.response.set_header("Content-Type", "application/json")
+ end
+ populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text)
+ return 200, proto.build_deny_response({
+ stream = is_stream,
+ text = cached_text,
+ })
+ end
+ end
+ end
+
+ ctx.ai_cache_status = "MISS"
+ ctx.ai_cache_scope_hash = scope_hash
+ ctx.ai_cache_prompt_hash = prompt_hash
+ ctx.ai_cache_prompt_text = prompt_text
+end
+
+
+function _M.header_filter(conf, ctx)
+ if not ctx.ai_cache_status then
+ return
+ end
+
+ local status_header = (conf.headers and conf.headers.cache_status)
+ or "X-AI-Cache-Status"
+ ngx.header[status_header] = ctx.ai_cache_status
+
+ if ctx.ai_cache_status == "HIT-L1" and ctx.ai_cache_written_at then
+ local age_header = (conf.headers and conf.headers.cache_age)
+ or "X-AI-Cache-Age"
+ ngx.header[age_header] = tostring(ngx_time() - ctx.ai_cache_written_at)
+ end
+
+ if ctx.ai_cache_status == "HIT-L2" and ctx.ai_cache_similarity then
+ local sim_header = (conf.headers and conf.headers.cache_similarity)
+ or "X-AI-Cache-Similarity"
+ ngx.header[sim_header] = tostring(ctx.ai_cache_similarity)
+ end
+end
+
+
+function _M.log(conf, ctx)
+ if ctx.ai_cache_status ~= "MISS" then
+ return
+ end
+
+ -- Early-MISS paths (body parse / protocol detect / empty content) skip
+ -- key computation, so bail out if cache key fields are absent.
+ if not ctx.ai_cache_prompt_hash or not ctx.ai_cache_prompt_text then
+ return
+ end
+
+ local upstream_status = core.response.get_upstream_status(ctx) or ngx.status
+ if not upstream_status or upstream_status < 200 or upstream_status >= 300 then
+ return
+ end
+
+ local response_text = ctx.var.llm_response_text
+ if not response_text or response_text == "" then
+ return
+ end
+
+ local max_size = conf.max_cache_body_size or 1048576
+ if #response_text > max_size then
+ core.log.warn("ai-cache: response size ", #response_text,
+ " exceeds max_cache_body_size ", max_size,
+ ", skipping cache write")
+ return
+ end
+
+ local exact_enabled = layer_enabled(conf, "exact")
+ local semantic_enabled = layer_enabled(conf, "semantic")
+ local ttl_exact = (conf.exact and conf.exact.ttl) or 3600
+ local scope_hash = ctx.ai_cache_scope_hash
+ local prompt_hash = ctx.ai_cache_prompt_hash
+ local embedding = ctx.ai_cache_embedding
+
+ local ok, timer_err = ngx.timer.at(0, function(premature)
+ if premature then
+ return
+ end
+
+ if exact_enabled then
+ local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl_exact)
+ if err then
+ ngx.log(ngx.WARN, "ai-cache: failed to write L1 cache: ", err)
+ end
+ end
+
+ if semantic_enabled then
+ if not embedding then
+ return
+ end
+
+ local ttl_semantic = (conf.semantic and conf.semantic.ttl) or 86400
+ local store_err = semantic.store(
+ conf, scope_hash, embedding, response_text, ttl_semantic
+ )
+ if store_err then
+ ngx.log(ngx.WARN, "ai-cache: failed to write L2 cache: ", store_err)
+ end
+ end
+ end)
+ if not ok then
+ core.log.warn("ai-cache: failed to schedule cache write: ", timer_err)
+ end
+end
+
+
+return _M
diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua
new file mode 100644
index 000000000000..6f862ea78cc8
--- /dev/null
+++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua
@@ -0,0 +1,76 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+local type = type
+
+local ngx = ngx
+local HTTP_OK = ngx.HTTP_OK
+local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR
+
+local _M = {}
+
+
+function _M.get_embeddings(conf, text, httpc, ssl_verify)
+ local body, err = core.json.encode({ input = text })
+ if not body then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err
+ end
+
+ httpc:set_timeout(conf.timeout)
+
+ local res, err = httpc:request_uri(conf.endpoint, {
+ method = "POST",
+ headers = {
+ ["Content-Type"] = "application/json",
+ ["api-key"] = conf.api_key,
+ },
+ body = body,
+ ssl_verify = ssl_verify,
+ keepalive = true,
+ })
+
+ if not res or not res.body then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API"
+ end
+
+ if res.status ~= HTTP_OK then
+ return nil, res.status, res.body
+ end
+
+ local res_tab, err = core.json.decode(res.body)
+ if not res_tab then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err
+ end
+
+ if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body
+ end
+
+ local embedding = res_tab.data[1].embedding
+ if type(embedding) ~= "table" then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response"
+ end
+ if #embedding == 0 then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty"
+ end
+
+ return embedding, nil, nil
+end
+
+
+return _M
diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua
new file mode 100644
index 000000000000..740b12d23f2d
--- /dev/null
+++ b/apisix/plugins/ai-cache/embeddings/openai.lua
@@ -0,0 +1,79 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+local type = type
+
+local ngx = ngx
+local HTTP_OK = ngx.HTTP_OK
+local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR
+
+local _M = {}
+
+
+function _M.get_embeddings(conf, text, httpc, ssl_verify)
+ local body, err = core.json.encode({
+ input = text,
+ model = conf.model or "text-embedding-3-small",
+ })
+ if not body then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err
+ end
+
+ httpc:set_timeout(conf.timeout)
+
+ local res, err = httpc:request_uri(conf.endpoint, {
+ method = "POST",
+ headers = {
+ ["Content-Type"] = "application/json",
+ ["Authorization"] = "Bearer " .. conf.api_key,
+ },
+ body = body,
+ ssl_verify = ssl_verify,
+ keepalive = true,
+ })
+
+ if not res or not res.body then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API"
+ end
+
+ if res.status ~= HTTP_OK then
+ return nil, res.status, res.body
+ end
+
+ local res_tab, err = core.json.decode(res.body)
+ if not res_tab then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, err
+ end
+
+ if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body
+ end
+
+ local embedding = res_tab.data[1].embedding
+ if type(embedding) ~= "table" then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response"
+ end
+ if #embedding == 0 then
+ return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty"
+ end
+
+ return embedding, nil, nil
+end
+
+
+return _M
diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua
new file mode 100644
index 000000000000..e1a63f9a5f41
--- /dev/null
+++ b/apisix/plugins/ai-cache/exact.lua
@@ -0,0 +1,136 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local core = require("apisix.core")
+local redis = require("apisix.utils.redis")
+local resty_sha256 = require("resty.sha256")
+local to_hex = require("resty.string").to_hex
+
+local ngx = ngx
+local ngx_time = ngx.time
+local ipairs = ipairs
+local tostring = tostring
+local table_concat = table.concat
+
+local KEY_PREFIX = "ai-cache:l1:"
+
+local _M = {}
+
+
+local function sha256_hex(s)
+ local hash = resty_sha256:new()
+ hash:update(s)
+ return to_hex(hash:final())
+end
+
+
+function _M.compute_scope_hash(conf, ctx)
+ local cache_key = conf.cache_key
+ if not cache_key then
+ return ""
+ end
+
+ local parts = {}
+ local n = 0
+
+ if cache_key.include_consumer then
+ n = n + 1
+ parts[n] = ctx.consumer_name or ""
+ end
+
+ if cache_key.include_vars then
+ for _, var_name in ipairs(cache_key.include_vars) do
+ local key = var_name
+ if key:sub(1, 1) == "$" then
+ key = key:sub(2)
+ end
+ n = n + 1
+ parts[n] = tostring(ctx.var[key] or "")
+ end
+ end
+
+ if n == 0 then
+ return ""
+ end
+
+ return sha256_hex(table_concat(parts, "|"))
+end
+
+
+function _M.compute_prompt_hash(text)
+ return sha256_hex(text)
+end
+
+
+function _M.get(conf, scope_hash, prompt_hash)
+ local red, err = redis.new(conf)
+ if not red then
+ return nil, nil, err
+ end
+
+ local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash
+ local res, get_err = red:get(key)
+ if get_err then
+ red:close()
+ return nil, nil, get_err
+ end
+
+ red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool)
+
+ if res == ngx.null then
+ return nil, nil, nil
+ end
+
+ local entry, decode_err = core.json.decode(res)
+ if not entry then
+ return nil, nil, "corrupt cache entry: " .. decode_err
+ end
+
+ return entry.text, entry.written_at, nil
+end
+
+
+function _M.set(conf, scope_hash, prompt_hash, text, ttl)
+ local red, err = redis.new(conf)
+ if not red then
+ return err
+ end
+
+ local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash
+ local entry, encode_err = core.json.encode({
+ text = text,
+ written_at = ngx_time(),
+ })
+
+ if not entry then
+ red:close()
+ return encode_err
+ end
+
+ local ok, set_err = red:set(key, entry, "EX", ttl)
+ if not ok then
+ red:close()
+ return set_err
+ end
+
+ red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool)
+
+ return nil
+end
+
+
+return _M
diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua
new file mode 100644
index 000000000000..02587f7fb14c
--- /dev/null
+++ b/apisix/plugins/ai-cache/schema.lua
@@ -0,0 +1,189 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local redis_schema = require("apisix.utils.redis-schema")
+
+local _M = {}
+
+local embedding_schema = {
+ type = "object",
+ properties = {
+ provider = {
+ type = "string",
+ enum = { "openai", "azure_openai" },
+ description = "Embedding API provider.",
+ },
+ model = {
+ type = "string",
+ description = "Embedding model name. Sent in the request body for "
+ .. "provider: openai; ignored for provider: azure_openai "
+ .. "(Azure infers the model from the deployment URL).",
+ },
+ endpoint = {
+ type = "string",
+ description = "Embedding API endpoint URL.",
+ },
+ api_key = {
+ type = "string",
+ description = "API key for the embedding provider.",
+ },
+ timeout = {
+ type = "integer",
+ minimum = 1,
+ maximum = 600000,
+ default = 5000,
+ description = "HTTP request timeout in milliseconds for embedding API calls.",
+ },
+ ssl_verify = {
+ type = "boolean",
+ default = true,
+ description = "Whether to verify the embedding endpoint's TLS certificate.",
+ },
+ },
+ required = { "provider", "endpoint", "api_key" },
+}
+
+local semantic_schema = {
+ type = "object",
+ properties = {
+ similarity_threshold = {
+ type = "number",
+ minimum = 0,
+ maximum = 1,
+ default = 0.95,
+ description = "Minimum cosine similarity required for a semantic-layer hit.",
+ },
+ top_k = {
+ type = "integer",
+ minimum = 1,
+ maximum = 100,
+ default = 1,
+ description = "Number of nearest-neighbor candidates the index returns; "
+ .. "the first candidate above similarity_threshold is used.",
+ },
+ ttl = {
+ type = "integer",
+ minimum = 1,
+ default = 86400,
+ description = "Time-to-live in seconds for semantic-layer entries.",
+ },
+ embedding = embedding_schema,
+ },
+ required = { "embedding" },
+}
+
+local exact_schema = {
+ type = "object",
+ properties = {
+ ttl = {
+ type = "integer",
+ minimum = 1,
+ default = 3600,
+ description = "Time-to-live in seconds for exact-layer entries.",
+ },
+ },
+}
+
+
+local bypass_item_schema = {
+ type = "object",
+ properties = {
+ header = {
+ type = "string",
+ description = "Request header name to inspect.",
+ },
+ equals = {
+ type = "string",
+ description = "Value to match against the header. "
+ .. "If equal, the request bypasses the cache.",
+ },
+ },
+ required = { "header", "equals" },
+}
+
+local headers_schema = {
+ type = "object",
+ properties = {
+ cache_status = {
+ type = "string",
+ default = "X-AI-Cache-Status",
+ description = "Response header name for cache status "
+ .. "(HIT-L1 / HIT-L2 / MISS / BYPASS).",
+ },
+ cache_similarity = {
+ type = "string",
+ default = "X-AI-Cache-Similarity",
+ description = "Response header name for the similarity score of a semantic-layer hit.",
+ },
+ cache_age = {
+ type = "string",
+ default = "X-AI-Cache-Age",
+ description = "Response header name for the age in seconds of an exact-layer hit.",
+ },
+ },
+}
+
+_M.schema = {
+ type = "object",
+ properties = {
+ layers = {
+ type = "array",
+ items = { type = "string", enum = { "exact", "semantic" } },
+ uniqueItems = true,
+ minItems = 1,
+ default = { "exact", "semantic" },
+ description = "Cache layers to enable, queried in order.",
+ },
+ cache_key = {
+ type = "object",
+ properties = {
+ include_consumer = {
+ type = "boolean",
+ default = false,
+ description = "If true, partition the cache by consumer name.",
+ },
+ include_vars = {
+ type = "array",
+ items = { type = "string" },
+ default = {},
+ description = "Additional ctx.var names included in the cache key, "
+ .. "for example [\"$http_x_tenant_id\"].",
+ },
+ },
+ },
+ exact = exact_schema,
+ semantic = semantic_schema,
+ bypass_on = {
+ type = "array",
+ items = bypass_item_schema,
+ description = "List of {header, equals} rules. "
+ .. "If any matches, the request bypasses the cache.",
+ },
+ headers = headers_schema,
+ max_cache_body_size = {
+ type = "integer",
+ minimum = 1,
+ default = 1048576,
+ description = "Maximum response size in bytes to write to cache. "
+ .. "Larger responses pass through but are not cached.",
+ },
+ },
+ allOf = { redis_schema.schema.redis },
+ encrypt_fields = { "semantic.embedding.api_key", "redis_password" },
+}
+
+return _M
diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua
new file mode 100644
index 000000000000..6d84dbd28675
--- /dev/null
+++ b/apisix/plugins/ai-cache/semantic.lua
@@ -0,0 +1,212 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local apisix_redis = require("apisix.utils.redis")
+local uuid = require("resty.jit-uuid")
+local ffi = require("ffi")
+
+local ffi_new = ffi.new
+local ffi_string = ffi.string
+local ngx_time = ngx.time
+local tostring = tostring
+local tonumber = tonumber
+local type = type
+
+local _M = {}
+
+
+local function index_name(dim)
+ return "ai-cache-idx-" .. dim
+end
+
+
+local function key_prefix(dim)
+ return "ai-cache:l2:" .. dim .. ":"
+end
+
+-- Pack a Lua array of numbers into the raw float32 byte string layout that
+-- RediSearch expects for a FLOAT32 vector field.
+local function pack_vector(vec)
+ local n = #vec
+ local buf = ffi_new("float[?]", n)
+ for i = 0, n - 1 do
+ buf[i] = vec[i + 1]
+ end
+ return ffi_string(buf, n * 4)
+end
+
+local index_ready = {}
+local index_unsupported = false
+
+local function ensure_index(red, dim)
+ if index_unsupported then
+ return nil, "RediSearch not supported on this Redis instance"
+ end
+
+ if index_ready[dim] then
+ return true
+ end
+
+ local _, err = red["FT.CREATE"](red,
+ index_name(dim),
+ "ON", "HASH",
+ "PREFIX", "1", key_prefix(dim),
+ "SCHEMA",
+ "embedding", "VECTOR", "HNSW", "6",
+ "TYPE", "FLOAT32",
+ "DIM", tostring(dim),
+ "DISTANCE_METRIC", "COSINE",
+ "scope", "TAG",
+ "created_at", "NUMERIC"
+ )
+
+ if err then
+ -- RediSearch module absent — latch and stop retrying on every request
+ if err:find("unknown command", 1, true)
+ or err:find("ERR unknown", 1, true) then
+ index_unsupported = true
+ return nil, "RediSearch not supported on this Redis instance: " .. err
+ end
+ if not err:find("already exists") then
+ return nil, "FT.CREATE failed: " .. err
+ end
+ end
+
+ index_ready[dim] = true
+ return true
+end
+
+
+function _M.search(conf, scope_hash, embedding_vec, threshold)
+ local red, err = apisix_redis.new(conf)
+ if not red then
+ return nil, nil, err
+ end
+
+ local ok, init_err = ensure_index(red, #embedding_vec)
+ if not ok then
+ red:close()
+ return nil, nil, init_err
+ end
+
+ local binary_vec = pack_vector(embedding_vec)
+ local top_k = (conf.semantic and conf.semantic.top_k) or 1
+ local top_k_str = tostring(top_k)
+
+ local query
+ if scope_hash == "" then
+ query = "*=>[KNN " .. top_k_str .. " @embedding $vec AS dist]"
+ else
+ query = "@scope:{" .. scope_hash .. "}=>[KNN " .. top_k_str
+ .. " @embedding $vec AS dist]"
+ end
+
+ local res, search_err = red["FT.SEARCH"](red,
+ index_name(#embedding_vec),
+ query,
+ "PARAMS", "2", "vec", binary_vec,
+ "SORTBY", "dist", "ASC",
+ "LIMIT", "0", top_k_str,
+ "RETURN", "2", "response", "dist",
+ "DIALECT", "2"
+ )
+
+ if search_err then
+ red:close()
+ -- index was dropped externally — invalidate so next call recreates
+ if search_err:find("Unknown Index name", 1, true) then
+ index_ready[#embedding_vec] = nil
+ end
+ return nil, nil, search_err
+ end
+
+ red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool)
+
+ if not res or res[1] == 0 then
+ return nil, nil, nil
+ end
+
+ -- RESP2: {count, key1, fields1, key2, fields2, ...}
+ -- Results are sorted by dist ASC. Iterate candidates and return the first
+ -- one whose similarity meets the threshold; skip candidates with missing
+ -- or corrupt fields.
+ for i = 3, #res, 2 do
+ local fields = res[i]
+ if type(fields) == "table" then
+ local response_text, dist
+ for j = 1, #fields, 2 do
+ if fields[j] == "response" then
+ response_text = fields[j + 1]
+ elseif fields[j] == "dist" then
+ dist = tonumber(fields[j + 1])
+ end
+ end
+
+            if response_text and dist then
+                -- with DISTANCE_METRIC COSINE, dist = 1 - cosine similarity
+                local similarity = 1 - dist
+ if similarity >= threshold then
+ return response_text, similarity, nil
+ end
+ end
+ end
+ end
+
+ return nil, nil, nil
+end
+
+
+function _M.store(conf, scope_hash, embedding_vec, text, ttl)
+ local red, err = apisix_redis.new(conf)
+ if not red then
+ return err
+ end
+
+ local ok, init_err = ensure_index(red, #embedding_vec)
+ if not ok then
+ red:close()
+ return init_err
+ end
+
+ local binary_vec = pack_vector(embedding_vec)
+ local key = key_prefix(#embedding_vec) .. uuid.generate_v4()
+
+ -- HSET + EXPIRE wrapped in MULTI/EXEC so the entry is never written
+ -- without its TTL (which would orphan it in Redis forever).
+ local _, multi_err = red:multi()
+ if multi_err then
+ red:close()
+ return multi_err
+ end
+
+ red:hset(key,
+ "embedding", binary_vec,
+ "response", text,
+ "scope", scope_hash,
+ "created_at", tostring(ngx_time())
+ )
+ red:expire(key, ttl)
+
+ local results, exec_err = red:exec()
+ if not results then
+ red:close()
+ return exec_err
+ end
+
+ red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool)
+ return nil
+end
+
+
+return _M
diff --git a/apisix/plugins/prometheus/exporter.lua b/apisix/plugins/prometheus/exporter.lua
index ce89ca03302a..78ce1bac0bf5 100644
--- a/apisix/plugins/prometheus/exporter.lua
+++ b/apisix/plugins/prometheus/exporter.lua
@@ -160,6 +160,14 @@ function _M.http_init(prometheus_enabled_in_stream)
"llm_completion_tokens", "expire")
local llm_active_connections_exptime = core.table.try_read_attr(attr, "metrics",
"llm_active_connections", "expire")
+ local ai_cache_hits_exptime = core.table.try_read_attr(attr, "metrics",
+ "ai_cache_hits", "expire")
+ local ai_cache_misses_exptime = core.table.try_read_attr(attr, "metrics",
+ "ai_cache_misses", "expire")
+ local ai_cache_embedding_latency_exptime = core.table.try_read_attr(attr, "metrics",
+ "ai_cache_embedding_latency", "expire")
+ local ai_cache_embedding_failures_exptime = core.table.try_read_attr(attr, "metrics",
+ "ai_cache_embedding_failures", "expire")
prometheus = base_prometheus.init("prometheus-metrics", metric_prefix)
@@ -260,6 +268,35 @@ function _M.http_init(prometheus_enabled_in_stream)
unpack(extra_labels("llm_active_connections"))},
llm_active_connections_exptime)
+ metrics.ai_cache_hits = prometheus:counter("ai_cache_hits_total",
+ "AI cache hit count by layer",
+ {"route_id", "service_id", "consumer", "layer",
+ unpack(extra_labels("ai_cache_hits"))},
+ ai_cache_hits_exptime)
+
+ metrics.ai_cache_misses = prometheus:counter("ai_cache_misses_total",
+ "AI cache miss count",
+ {"route_id", "service_id", "consumer",
+ unpack(extra_labels("ai_cache_misses"))},
+ ai_cache_misses_exptime)
+
+ local ai_cache_embedding_latency_buckets = DEFAULT_BUCKETS
+ if attr and attr.ai_cache_embedding_latency_buckets then
+ ai_cache_embedding_latency_buckets = attr.ai_cache_embedding_latency_buckets
+ end
+ metrics.ai_cache_embedding_latency = prometheus:histogram("ai_cache_embedding_latency",
+ "AI cache embedding API call latency in milliseconds",
+ {"route_id", "service_id", "consumer", "provider",
+ unpack(extra_labels("ai_cache_embedding_latency"))},
+ ai_cache_embedding_latency_buckets,
+ ai_cache_embedding_latency_exptime)
+
+ metrics.ai_cache_embedding_failures = prometheus:counter("ai_cache_embedding_failures_total",
+ "AI cache embedding API call failure count",
+ {"route_id", "service_id", "consumer",
+ unpack(extra_labels("ai_cache_embedding_failures"))},
+ ai_cache_embedding_failures_exptime)
+
if prometheus_enabled_in_stream then
init_stream_metrics()
end
@@ -377,6 +414,35 @@ function _M.http_log(conf, ctx)
vars.request_type, vars.request_llm_model, vars.llm_model,
unpack(extra_labels("llm_completion_tokens", ctx))))
end
+
+ if ctx.ai_cache_status then
+ if ctx.ai_cache_status == "HIT-L1" then
+ metrics.ai_cache_hits:inc(1,
+ gen_arr(route_id, service_id, consumer_name, "l1",
+ unpack(extra_labels("ai_cache_hits", ctx))))
+ elseif ctx.ai_cache_status == "HIT-L2" then
+ metrics.ai_cache_hits:inc(1,
+ gen_arr(route_id, service_id, consumer_name, "l2",
+ unpack(extra_labels("ai_cache_hits", ctx))))
+ elseif ctx.ai_cache_status == "MISS" then
+ metrics.ai_cache_misses:inc(1,
+ gen_arr(route_id, service_id, consumer_name,
+ unpack(extra_labels("ai_cache_misses", ctx))))
+ end
+
+ if ctx.ai_cache_embedding_latency_ms then
+ metrics.ai_cache_embedding_latency:observe(ctx.ai_cache_embedding_latency_ms,
+ gen_arr(route_id, service_id, consumer_name,
+ ctx.ai_cache_embedding_provider or "",
+ unpack(extra_labels("ai_cache_embedding_latency", ctx))))
+ end
+
+ if ctx.ai_cache_embedding_failed then
+ metrics.ai_cache_embedding_failures:inc(1,
+ gen_arr(route_id, service_id, consumer_name,
+ unpack(extra_labels("ai_cache_embedding_failures", ctx))))
+ end
+ end
end
@@ -790,6 +856,7 @@ function _M.dec_llm_active_connections(ctx)
inc_llm_active_connections(ctx, -1)
end
+
function _M.get_prometheus()
return prometheus
end
diff --git a/conf/config.yaml.example b/conf/config.yaml.example
index ae7155a86b06..901774540d70 100644
--- a/conf/config.yaml.example
+++ b/conf/config.yaml.example
@@ -514,6 +514,7 @@ plugins: # plugin list (sorted by priority)
- ai-prompt-template # priority: 1071
- ai-prompt-decorator # priority: 1070
- ai-prompt-guard # priority: 1072
+ - ai-cache # priority: 1065
- ai-rag # priority: 1060
- ai-aws-content-moderation # priority: 1050
- ai-proxy-multi # priority: 1041
diff --git a/docs/en/latest/config.json b/docs/en/latest/config.json
index d24eacc3f8e9..c198826c7505 100644
--- a/docs/en/latest/config.json
+++ b/docs/en/latest/config.json
@@ -75,6 +75,7 @@
"plugins/ai-proxy-multi",
"plugins/ai-rate-limiting",
"plugins/ai-prompt-guard",
+ "plugins/ai-cache",
"plugins/ai-aws-content-moderation",
"plugins/ai-aliyun-content-moderation",
"plugins/ai-prompt-decorator",
diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md
new file mode 100644
index 000000000000..523b727f836f
--- /dev/null
+++ b/docs/en/latest/plugins/ai-cache.md
@@ -0,0 +1,1194 @@
+---
+title: ai-cache
+keywords:
+ - Apache APISIX
+ - API Gateway
+ - Plugin
+ - ai-cache
+description: The ai-cache Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache, reducing latency and upstream cost.
+---
+
+
+
+
+
+
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+## Description
+
+The `ai-cache` Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache instead of incurring another upstream call. It supports two cache layers: an exact-match layer (`exact`) keyed by a hash of the prompt, and a semantic layer (`semantic`) that compares prompt embeddings via Redis Stack vector search. Either layer can be enabled independently, and when both are enabled a hit on the semantic layer backfills the exact layer so subsequent identical prompts return immediately.
+
+The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-multi](./ai-proxy-multi.md) on the same Route. The semantic layer requires Redis Stack with the RediSearch module and an embedding provider (OpenAI or Azure OpenAI). PRs for additional embedding providers are welcome.
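+
+For local testing of the semantic layer, you can run Redis Stack, which bundles the RediSearch module, in Docker. This is just one convenient option; any Redis instance with RediSearch loaded works:
+
+```shell
+docker run -d --name redis-stack -p 6379:6379 redis/redis-stack-server:latest
+```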
+
+## Plugin Attributes
+
+| Name | Type | Required | Default | Valid values | Description |
+| --- | --- | --- | --- | --- | --- |
+| `layers` | array[string] | False | `["exact", "semantic"]` | `"exact"`, `"semantic"` | Cache layers to enable, queried in order. |
+| `exact.ttl` | integer | False | `3600` | [1, ...] | Time-to-live in seconds for exact-layer entries. |
+| `semantic.similarity_threshold` | number | False | `0.95` | [0, 1] | Minimum cosine similarity required for a semantic-layer hit. |
+| `semantic.top_k` | integer | False | `1` | [1, 100] | Number of nearest-neighbor candidates the index returns; the first candidate above `similarity_threshold` is used. |
+| `semantic.ttl` | integer | False | `86400` | [1, ...] | Time-to-live in seconds for semantic-layer entries. |
+| `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. |
+| `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | Embedding API endpoint URL. |
+| `semantic.embedding.api_key` | string | True (if semantic enabled) | | | API key for the embedding provider. Stored encrypted. |
+| `semantic.embedding.model` | string | False | | | Embedding model name. Sent in the request body for `provider: openai`; ignored for `provider: azure_openai` (Azure infers the model from the deployment URL). Uses provider default if omitted. |
+| `semantic.embedding.timeout` | integer | False | `5000` | [1, 600000] | HTTP request timeout in milliseconds for embedding API calls. |
+| `semantic.embedding.ssl_verify` | boolean | False | `true` | | Whether to verify the embedding endpoint's TLS certificate. |
+| `cache_key.include_consumer` | boolean | False | `false` | | If `true`, partition the cache by consumer name. |
+| `cache_key.include_vars` | array[string] | False | `[]` | | Additional `ctx.var` names included in the cache key, for example `["$http_x_tenant_id"]`. |
+| `bypass_on` | array[object] | False | | | List of `{header, equals}` rules. If any matches, the request bypasses the cache. |
+| `max_cache_body_size` | integer | False | `1048576` | [1, ...] | Maximum response size in bytes to write to cache. Larger responses pass through but are not cached. |
+| `headers.cache_status` | string | False | `"X-AI-Cache-Status"` | | Response header for cache status (`HIT-L1`, `HIT-L2`, `MISS`, `BYPASS`). |
+| `headers.cache_age` | string | False | `"X-AI-Cache-Age"` | | Response header for the age in seconds of an exact-layer hit. |
+| `headers.cache_similarity` | string | False | `"X-AI-Cache-Similarity"` | | Response header for the similarity score of a semantic-layer hit. |
+| `redis_host` | string | True | | | The address of the Redis node. |
+| `redis_port` | integer | False | `6379` | [1, ...] | The port of the Redis node. |
+| `redis_username` | string | False | | | The username for Redis if Redis ACL is used. If you use the legacy authentication method `requirepass`, configure only the `redis_password`. |
+| `redis_password` | string | False | | | The password of the Redis node. |
+| `redis_database` | integer | False | `0` | [0, ...] | The database number in Redis. |
+| `redis_timeout` | integer | False | `1000` | [1, ...] | The Redis timeout value in milliseconds. |
+| `redis_ssl` | boolean | False | `false` | | If `true`, use SSL to connect to Redis. |
+| `redis_ssl_verify` | boolean | False | `false` | | If `true`, verify the server SSL certificate. |
+| `redis_keepalive_timeout` | integer | False | `10000` | [1000, ...] | Idle timeout in milliseconds for the Redis connection in the keepalive pool. |
+| `redis_keepalive_pool` | integer | False | `100` | [1, ...] | Maximum number of idle Redis connections kept in the keepalive pool. |
+
+## Examples
+
+The following examples use OpenAI as the Upstream service provider. Before proceeding, create an [OpenAI account](https://openai.com) and an [API key](https://openai.com/blog/openai-api). You can optionally save the key to an environment variable:
+
+```shell
+export OPENAI_API_KEY=
+```
+
+If you are working with other LLM providers, please refer to the provider's documentation to obtain an API key.
+
+:::note
+
+You can fetch the `admin_key` from `config.yaml` and save it to an environment variable with the following command:
+
+```shell
+admin_key=$(yq '.deployment.admin.admin_key[0].key' conf/config.yaml | sed 's/"//g')
+```
+
+:::
+
+### Cache Identical Prompts with the Exact Layer
+
+The following example demonstrates how to use the `ai-cache` Plugin with the exact layer only, so that identical prompts are returned from cache.
+
+
+
+
+Create a Route that uses [ai-proxy](./ai-proxy.md) to proxy to OpenAI and `ai-cache` to cache exact-match prompts:
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "uri": "/anything",
+ "methods": ["POST"],
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer '"$OPENAI_API_KEY"'"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 3600 },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }'
+```
+
+
+
+
+Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such:
+
+```yaml title="adc.yaml"
+services:
+ - name: ai-cache-service
+ routes:
+ - name: ai-cache-route
+ uris:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ ai-proxy:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer ${OPENAI_API_KEY}"
+ options:
+ model: gpt-4
+ ai-cache:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ redis_host: 127.0.0.1
+```
+
+Synchronize the configuration to the gateway:
+
+```shell
+adc sync -f adc.yaml
+```
+
+
+
+
+
+
+
+Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such:
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v1alpha1
+kind: PluginConfig
+metadata:
+ namespace: aic
+ name: ai-cache-plugin-config
+spec:
+ plugins:
+ - name: ai-proxy
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ redis_host: redis-stack
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ parentRefs:
+ - name: apisix
+ rules:
+ - matches:
+ - path:
+ type: Exact
+ value: /anything
+ method: POST
+ filters:
+ - type: ExtensionRef
+ extensionRef:
+ group: apisix.apache.org
+ kind: PluginConfig
+ name: ai-cache-plugin-config
+```
+
+
+
+
+Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such:
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v2
+kind: ApisixRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ ingressClassName: apisix
+ http:
+ - name: ai-cache-route
+ match:
+ paths:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ - name: ai-proxy
+ enable: true
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ enable: true
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ redis_host: redis-stack
+```
+
+
+
+
+Apply the configuration to your cluster:
+
+```shell
+kubectl apply -f ai-cache-ic.yaml
+```
+
+
+
+
+Send a request to the Route:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital of France?" }
+ ]
+ }'
+```
+
+The first request reaches OpenAI. Note the `X-AI-Cache-Status: MISS` header, indicating the prompt was not in cache and APISIX forwarded the request upstream:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: MISS
+
+{
+ "id": "chatcmpl-Da7Iqsqz9gc8Mkf07Hn4NCzAH5Ri1",
+ "object": "chat.completion",
+ "created": 1777500252,
+ "model": "gpt-4o-mini-2024-07-18",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital of France is Paris.",
+ "refusal": null
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 14,
+ "completion_tokens": 7,
+ "total_tokens": 21
+ },
+ "system_fingerprint": "fp_d3214ccada"
+}
+```
+
+Send the same request again:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital of France?" }
+ ]
+ }'
+```
+
+The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header signals an exact-match hit and `X-AI-Cache-Age` reports the entry's age in seconds. The cached response is replayed from Redis, so the body is shorter and does not contain the original `created`, `model`, `usage`, or `system_fingerprint` fields:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: HIT-L1
+X-AI-Cache-Age: 4
+
+{
+ "id": "f558665e-3a03-42e3-9aa9-f54c402927c0",
+ "object": "chat.completion",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "content": "The capital of France is Paris.",
+ "role": "assistant"
+ },
+ "finish_reason": "stop"
+ }
+ ]
+}
+```
+
+### Cache Paraphrased Prompts with the Semantic Layer
+
+The following example demonstrates how to enable the semantic layer so that prompts with different wording but similar meaning are served from cache.
+
+
+
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "uri": "/anything",
+ "methods": ["POST"],
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer '"$OPENAI_API_KEY"'"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact", "semantic"],
+ "exact": { "ttl": 3600 },
+ "semantic": {
+ "similarity_threshold": 0.85,
+ "ttl": 86400,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "https://api.openai.com/v1/embeddings",
+ "api_key": "'"$OPENAI_API_KEY"'",
+ "model": "text-embedding-3-small"
+ }
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }'
+```
+
+
+
+
+```yaml title="adc.yaml"
+services:
+ - name: ai-cache-service
+ routes:
+ - name: ai-cache-route
+ uris:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ ai-proxy:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer ${OPENAI_API_KEY}"
+ options:
+ model: gpt-4
+ ai-cache:
+ layers:
+ - exact
+ - semantic
+ exact:
+ ttl: 3600
+ semantic:
+ similarity_threshold: 0.85
+ ttl: 86400
+ embedding:
+ provider: openai
+ endpoint: https://api.openai.com/v1/embeddings
+ api_key: "${OPENAI_API_KEY}"
+ model: text-embedding-3-small
+ redis_host: 127.0.0.1
+```
+
+Synchronize the configuration to the gateway:
+
+```shell
+adc sync -f adc.yaml
+```
+
+
+
+
+
+
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v1alpha1
+kind: PluginConfig
+metadata:
+ namespace: aic
+ name: ai-cache-plugin-config
+spec:
+ plugins:
+ - name: ai-proxy
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ config:
+ layers:
+ - exact
+ - semantic
+ exact:
+ ttl: 3600
+ semantic:
+ similarity_threshold: 0.85
+ ttl: 86400
+ embedding:
+ provider: openai
+ endpoint: https://api.openai.com/v1/embeddings
+ api_key: your-api-key
+ model: text-embedding-3-small
+ redis_host: redis-stack
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ parentRefs:
+ - name: apisix
+ rules:
+ - matches:
+ - path:
+ type: Exact
+ value: /anything
+ method: POST
+ filters:
+ - type: ExtensionRef
+ extensionRef:
+ group: apisix.apache.org
+ kind: PluginConfig
+ name: ai-cache-plugin-config
+```
+
+
+
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v2
+kind: ApisixRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ ingressClassName: apisix
+ http:
+ - name: ai-cache-route
+ match:
+ paths:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ - name: ai-proxy
+ enable: true
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ enable: true
+ config:
+ layers:
+ - exact
+ - semantic
+ exact:
+ ttl: 3600
+ semantic:
+ similarity_threshold: 0.85
+ ttl: 86400
+ embedding:
+ provider: openai
+ endpoint: https://api.openai.com/v1/embeddings
+ api_key: your-api-key
+ model: text-embedding-3-small
+ redis_host: redis-stack
+```
+
+
+
+
+Apply the configuration to your cluster:
+
+```shell
+kubectl apply -f ai-cache-ic.yaml
+```
+
+
+
+
+Send a first request:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital city of China?" }
+ ]
+ }'
+```
+
+The first request reaches OpenAI with `X-AI-Cache-Status: MISS`:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: MISS
+
+{
+ "id": "chatcmpl-DcCIDs6ZJisclo84FUk5fT2Ks5vzn",
+ "object": "chat.completion",
+ "model": "gpt-4-0613",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital city of China is Beijing."
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 }
+}
+```
+
+Wait a couple of seconds for the semantic-layer write to complete in the background, then send a second request with paraphrased wording:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "Capital city of China?" }
+ ]
+ }'
+```
+
+The semantic layer matches the embedding (cosine similarity above the threshold) and returns the cached response without contacting OpenAI. The `X-AI-Cache-Status: HIT-L2` header signals a semantic-layer hit and `X-AI-Cache-Similarity` reports the cosine similarity score:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: HIT-L2
+X-AI-Cache-Similarity: 0.9065774679184
+
+{
+ "id": "a95488bb-4a51-491a-bd5b-2c1d0e5f8a9b",
+ "object": "chat.completion",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "content": "The capital city of China is Beijing.",
+ "role": "assistant"
+ },
+ "finish_reason": "stop"
+ }
+ ]
+}
+```
+
+When the `exact` layer is also enabled (as in this example), a semantic-layer hit backfills it, so an immediate retry of the same paraphrase returns `X-AI-Cache-Status: HIT-L1`.
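+
+To observe the backfill, resend the exact same paraphrase right away; since the semantic hit above already wrote the entry to the exact layer synchronously, the retry should report an exact-layer hit:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [
+      { "role": "user", "content": "Capital city of China?" }
+    ]
+  }'
+```
+
+The response headers should now include `X-AI-Cache-Status: HIT-L1`.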
+
+### Isolate Cache Entries Per Consumer or Tenant
+
+The following example demonstrates how to namespace cache entries so that one consumer's response is not served to another. Use `cache_key.include_consumer` to partition by consumer name, or `cache_key.include_vars` to include request variables such as a tenant header.
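+
+Under the hood (per this plugin's exact-layer implementation), the consumer name and each configured variable value are joined and hashed into a scope prefix, so exact-layer keys take the form `ai-cache:l1:<scope_hash>:<prompt_hash>`. You can inspect the resulting partitioning directly in Redis:
+
+```shell
+# list exact-layer entries; identical prompts from different tenants
+# appear under different <scope_hash> prefixes
+redis-cli --scan --match 'ai-cache:l1:*'
+```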
+
+
+
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "uri": "/anything",
+ "methods": ["POST"],
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer '"$OPENAI_API_KEY"'"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 3600 },
+ "cache_key": {
+ "include_consumer": true,
+ "include_vars": ["$http_x_tenant_id"]
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }'
+```
+
+
+
+
+```yaml title="adc.yaml"
+services:
+ - name: ai-cache-service
+ routes:
+ - name: ai-cache-route
+ uris:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ ai-proxy:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer ${OPENAI_API_KEY}"
+ options:
+ model: gpt-4
+ ai-cache:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ cache_key:
+ include_consumer: true
+ include_vars:
+ - "$http_x_tenant_id"
+ redis_host: 127.0.0.1
+```
+
+Synchronize the configuration to the gateway:
+
+```shell
+adc sync -f adc.yaml
+```
+
+
+
+
+
+
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v1alpha1
+kind: PluginConfig
+metadata:
+ namespace: aic
+ name: ai-cache-plugin-config
+spec:
+ plugins:
+ - name: ai-proxy
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ cache_key:
+ include_consumer: true
+ include_vars:
+ - "$http_x_tenant_id"
+ redis_host: redis-stack
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ parentRefs:
+ - name: apisix
+ rules:
+ - matches:
+ - path:
+ type: Exact
+ value: /anything
+ method: POST
+ filters:
+ - type: ExtensionRef
+ extensionRef:
+ group: apisix.apache.org
+ kind: PluginConfig
+ name: ai-cache-plugin-config
+```
+
+
+
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v2
+kind: ApisixRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ ingressClassName: apisix
+ http:
+ - name: ai-cache-route
+ match:
+ paths:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ - name: ai-proxy
+ enable: true
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ enable: true
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ cache_key:
+ include_consumer: true
+ include_vars:
+ - "$http_x_tenant_id"
+ redis_host: redis-stack
+```
+
+
+
+
+Apply the configuration to your cluster:
+
+```shell
+kubectl apply -f ai-cache-ic.yaml
+```
+
+
+
+
+Send a first request as `tenant-a`:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -H "X-Tenant-Id: tenant-a" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital city of Japan?" }
+ ]
+ }'
+```
+
+The first request reaches OpenAI with `X-AI-Cache-Status: MISS` and primes `tenant-a`'s cache scope:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: MISS
+
+{
+ "id": "chatcmpl-DcCRAzeSsimIOIeLQWsKtDxMLAAhu",
+ "object": "chat.completion",
+ "model": "gpt-4-0613",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital city of Japan is Tokyo."
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 }
+}
+```
+
+Repeat the same prompt as `tenant-a`:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -H "X-Tenant-Id: tenant-a" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital city of Japan?" }
+ ]
+ }'
+```
+
+The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header confirms `tenant-a`'s entry was reused:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: HIT-L1
+X-AI-Cache-Age: 6
+
+{
+ "id": "6be4f7a2-83f1-4cdc-8654-cee0396bd4f3",
+ "object": "chat.completion",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "content": "The capital city of Japan is Tokyo.",
+ "role": "assistant"
+ },
+ "finish_reason": "stop"
+ }
+ ]
+}
+```
+
+Send the same prompt as a different tenant, `tenant-b`:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -H "X-Tenant-Id: tenant-b" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital city of Japan?" }
+ ]
+ }'
+```
+
+Even though the prompt is identical, the request reaches OpenAI with `X-AI-Cache-Status: MISS` because `tenant-b` has its own cache scope:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: MISS
+
+{
+ "id": "chatcmpl-DcCROH92JLWcgyhSpwEoutTvqnew5",
+ "object": "chat.completion",
+ "model": "gpt-4-0613",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital city of Japan is Tokyo."
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 }
+}
+```
+
+### Bypass the Cache on a Header
+
+The following example demonstrates how to skip the cache entirely when a request carries a specific header, for example to force a fresh response from the upstream or to aid debugging by support staff.
+
+
+
+
+```shell
+curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \
+ -H "X-API-KEY: ${admin_key}" \
+ -d '{
+ "uri": "/anything",
+ "methods": ["POST"],
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer '"$OPENAI_API_KEY"'"
+ }
+ },
+ "options": {
+ "model": "gpt-4"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 3600 },
+ "bypass_on": [
+ { "header": "X-Cache-Bypass", "equals": "1" }
+ ],
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }'
+```
+
+
+
+
+```yaml title="adc.yaml"
+services:
+ - name: ai-cache-service
+ routes:
+ - name: ai-cache-route
+ uris:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ ai-proxy:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer ${OPENAI_API_KEY}"
+ options:
+ model: gpt-4
+ ai-cache:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ bypass_on:
+ - header: X-Cache-Bypass
+ equals: "1"
+ redis_host: 127.0.0.1
+```
+
+Synchronize the configuration to the gateway:
+
+```shell
+adc sync -f adc.yaml
+```
+
+If you run APISIX on Kubernetes with the APISIX Ingress Controller, you can express the same configuration with a `PluginConfig` referenced from an `HTTPRoute`:
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v1alpha1
+kind: PluginConfig
+metadata:
+ namespace: aic
+ name: ai-cache-plugin-config
+spec:
+ plugins:
+ - name: ai-proxy
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ bypass_on:
+ - header: X-Cache-Bypass
+ equals: "1"
+ redis_host: redis-stack
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ parentRefs:
+ - name: apisix
+ rules:
+ - matches:
+ - path:
+ type: Exact
+ value: /anything
+ method: POST
+ filters:
+ - type: ExtensionRef
+ extensionRef:
+ group: apisix.apache.org
+ kind: PluginConfig
+ name: ai-cache-plugin-config
+```
+
+Alternatively, use the `ApisixRoute` custom resource:
+
+```yaml title="ai-cache-ic.yaml"
+apiVersion: apisix.apache.org/v2
+kind: ApisixRoute
+metadata:
+ namespace: aic
+ name: ai-cache-route
+spec:
+ ingressClassName: apisix
+ http:
+ - name: ai-cache-route
+ match:
+ paths:
+ - /anything
+ methods:
+ - POST
+ plugins:
+ - name: ai-proxy
+ enable: true
+ config:
+ provider: openai
+ auth:
+ header:
+ Authorization: "Bearer your-api-key"
+ options:
+ model: gpt-4
+ - name: ai-cache
+ enable: true
+ config:
+ layers:
+ - exact
+ exact:
+ ttl: 3600
+ bypass_on:
+ - header: X-Cache-Bypass
+ equals: "1"
+ redis_host: redis-stack
+```
+
+Apply the configuration to your cluster:
+
+```shell
+kubectl apply -f ai-cache-ic.yaml
+```
+
+Send a request with the bypass header:
+
+```shell
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+ -H "Content-Type: application/json" \
+ -H "X-Cache-Bypass: 1" \
+ -d '{
+ "messages": [
+ { "role": "user", "content": "What is the capital of France?" }
+ ]
+ }'
+```
+
+The request reaches OpenAI regardless of any existing cached entry, and the response is not written back to the cache. The `X-AI-Cache-Status: BYPASS` header confirms the cache was skipped, and the presence of the original `created`, `model`, `usage`, and `system_fingerprint` fields verifies the upstream was contacted:
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: BYPASS
+
+{
+ "id": "chatcmpl-Da7N4E9fA6KoQ7av98hL0zxplPCcD",
+ "object": "chat.completion",
+ "created": 1777500514,
+ "model": "gpt-4o-mini-2024-07-18",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "The capital of France is Paris.",
+ "refusal": null
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 14,
+ "completion_tokens": 7,
+ "total_tokens": 21
+ },
+ "system_fingerprint": "fp_d3214ccada"
+}
+```
+
+## Caveats
+
+### The semantic-layer write is asynchronous
+
+After a `MISS`, the embedding fetch and the write to the Redis vector store happen in a background timer. If you send a paraphrased prompt immediately after the first request, you may see another `MISS` because the entry has not been stored yet. Wait a couple of seconds before sending a paraphrase to verify a semantic hit.
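+
+For example, a quick way to verify a semantic hit from the command line is to prime the cache, pause, and then send the paraphrase. This is a sketch that assumes a route with the semantic layer enabled, as configured earlier in this guide; the two-second pause is an illustrative value, not a guarantee:
+
+```shell
+# Prime the cache; this request is a MISS and schedules the background
+# embedding fetch and vector write.
+curl -s "http://127.0.0.1:9080/anything" -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"messages":[{"role":"user","content":"What is the capital of France?"}]}' > /dev/null
+
+# Give the background timer time to store the entry.
+sleep 2
+
+# The paraphrase should now return X-AI-Cache-Status: HIT-L2.
+curl -i "http://127.0.0.1:9080/anything" -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"messages":[{"role":"user","content":"Name the capital city of France"}]}'
+```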
+
+### Similarity is mathematical, not human-judged
+
+Two prompts that look semantically equivalent to a human can score below the configured `similarity_threshold` and therefore miss the cache. Conversely, a small wording change can flip the result. For example, with `similarity_threshold` set to `0.85` and the cache primed with `"What is the capital of France?"`:
+
+| Prompt | Status | Similarity |
+|--------|--------|------------|
+| `capital of France?` | `HIT-L2` | `0.850` |
+| `capital of France what?` | `MISS` | (below threshold) |
+| `capital of France what is?` | `HIT-L2` | `0.972` |
+| `capital of France what please?` | `HIT-L2` | `0.924` |
+| `capital of France what is please tell me?` | `MISS` | (below threshold) |
+
+Lower the threshold to catch more paraphrases at the cost of occasionally serving a cached answer for a genuinely different question. Tune empirically against your traffic.
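+
+As a sketch, loosening the threshold is a single field change in the plugin configuration. The `0.80` below is an illustrative value, and the embedding settings mirror the earlier examples:
+
+```yaml
+ai-cache:
+  layers:
+    - exact
+    - semantic
+  exact:
+    ttl: 3600
+  semantic:
+    # Lower values admit looser paraphrases but raise the risk of
+    # serving a cached answer for a genuinely different question.
+    similarity_threshold: 0.80
+    ttl: 300
+    embedding:
+      provider: openai
+      endpoint: https://api.openai.com/v1/embeddings
+      api_key: ${OPENAI_API_KEY}
+  redis_host: 127.0.0.1
+```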
+
+### `BYPASS` does not refresh the cache
+
+A request with the bypass header reaches the upstream but its response is not written back. Use it to force a fresh upstream call without invalidating or replacing the existing cached entry.
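+
+If you need to actually evict entries, one option is to delete the keys directly in Redis. The `ai-cache:*` pattern below matches the key prefix used by the plugin's test suite and is an assumption about your deployment; verify it before running this against production data:
+
+```shell
+# Illustrative only: evict every ai-cache entry in this Redis instance.
+# The 'ai-cache:*' prefix is assumed from the plugin's test fixtures.
+redis-cli --scan --pattern 'ai-cache:*' | xargs -r redis-cli DEL
+```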
+
+The bypass header is not authenticated; any client that can set the configured header and value can bypass the cache. In production, gate access with an APISIX authentication plugin such as `key-auth` or `ip-restriction`, or strip the header at your upstream WAF.
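+
+As a minimal sketch, the ADC route from the earlier example can be gated by adding `key-auth` to its plugins; this assumes consumers with key-auth credentials have already been created:
+
+```yaml title="adc.yaml"
+services:
+  - name: ai-cache-service
+    routes:
+      - name: ai-cache-route
+        uris:
+          - /anything
+        methods:
+          - POST
+        plugins:
+          # Only authenticated consumers can reach the route,
+          # and therefore only they can send X-Cache-Bypass.
+          key-auth: {}
+          ai-proxy:
+            provider: openai
+            auth:
+              header:
+                Authorization: "Bearer ${OPENAI_API_KEY}"
+            options:
+              model: gpt-4
+          ai-cache:
+            layers:
+              - exact
+            exact:
+              ttl: 3600
+            bypass_on:
+              - header: X-Cache-Bypass
+                equals: "1"
+            redis_host: 127.0.0.1
+```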
diff --git a/t/admin/plugins.t b/t/admin/plugins.t
index adb98b28bc17..1454ec145eb0 100644
--- a/t/admin/plugins.t
+++ b/t/admin/plugins.t
@@ -98,6 +98,7 @@ ai-request-rewrite
ai-prompt-guard
ai-prompt-template
ai-prompt-decorator
+ai-cache
ai-rag
ai-aws-content-moderation
ai-proxy-multi
diff --git a/t/plugin/ai-cache-scope.t b/t/plugin/ai-cache-scope.t
new file mode 100644
index 000000000000..21facdcf4688
--- /dev/null
+++ b/t/plugin/ai-cache-scope.t
@@ -0,0 +1,384 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+BEGIN {
+ $ENV{TEST_ENABLE_CONTROL_API_V1} = "0";
+}
+
+use t::APISIX 'no_plan';
+
+log_level("info");
+repeat_each(1);
+no_long_string();
+no_shuffle();
+no_root_location();
+
+add_block_preprocessor(sub {
+ my ($block) = @_;
+
+ if (!defined $block->request) {
+ $block->set_value("request", "GET /t");
+ }
+
+ if (!$block->error_log && !$block->no_error_log) {
+ $block->set_value("no_error_log", "[error]\n[alert]");
+ }
+
+ if (!defined $block->http_config) {
+ $block->set_value("http_config", <<_EOC_);
+server {
+ listen 1990;
+ default_type 'application/json';
+
+ location /v1/embeddings {
+ content_by_lua_block {
+ local fixture_loader = require("lib.fixture_loader")
+ local content, err = fixture_loader.load("openai/embeddings-list.json")
+ if not content then
+ ngx.status = 500
+ ngx.say(err)
+ return
+ end
+
+ ngx.status = 200
+ ngx.print(content)
+ }
+ }
+}
+_EOC_
+ }
+});
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: set up route with cache_key include_vars
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/scoped",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "cache_key": {
+ "include_vars": ["$http_x_tenant_id"]
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 2: tenant-a first request - MISS
+--- request
+POST /scoped
+{"messages":[{"role":"user","content":"scope test prompt"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-a
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 3: tenant-b same prompt - MISS (proves cache_key partitioning)
+--- request
+POST /scoped
+{"messages":[{"role":"user","content":"scope test prompt"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-b
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 4: tenant-a same prompt again - HIT-L1
+--- request
+POST /scoped
+{"messages":[{"role":"user","content":"scope test prompt"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-a
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+
+
+
+=== TEST 5: set up consumers for include_consumer test
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+
+ local consumers = {
+ { username = "alice", key = "alice-key" },
+ { username = "bob", key = "bob-key" },
+ }
+
+ for _, c in ipairs(consumers) do
+ local code, body = t('/apisix/admin/consumers',
+ ngx.HTTP_PUT,
+ string.format([[{
+ "username": "%s",
+ "plugins": { "key-auth": { "key": "%s" } }
+ }]], c.username, c.key)
+ )
+ if code >= 300 then
+ ngx.status = code
+ ngx.say(body)
+ return
+ end
+ end
+ ngx.say("passed")
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 6: set up route with cache_key include_consumer + key-auth
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/2',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/per-consumer",
+ "plugins": {
+ "key-auth": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "cache_key": {
+ "include_consumer": true
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 7: alice first request - MISS
+--- request
+POST /per-consumer
+{"messages":[{"role":"user","content":"per-consumer prompt"}]}
+--- more_headers
+Content-Type: application/json
+apikey: alice-key
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 8: bob same prompt - MISS (proves include_consumer partitioning)
+--- request
+POST /per-consumer
+{"messages":[{"role":"user","content":"per-consumer prompt"}]}
+--- more_headers
+Content-Type: application/json
+apikey: bob-key
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 9: bob same prompt again - HIT-L1 (proves bob has own cache)
+--- request
+POST /per-consumer
+{"messages":[{"role":"user","content":"per-consumer prompt"}]}
+--- more_headers
+Content-Type: application/json
+apikey: bob-key
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+
+
+
+=== TEST 10: set up route with L2 semantic + cache_key include_vars
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/3',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/scoped-semantic",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact", "semantic"],
+ "exact": { "ttl": 60 },
+ "semantic": {
+ "similarity_threshold": 0.90,
+ "ttl": 300,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "http://127.0.0.1:1990/v1/embeddings",
+ "api_key": "test-key"
+ }
+ },
+ "cache_key": {
+ "include_vars": ["$http_x_tenant_id"]
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 11: tenant-a first request - MISS, writes to L2 with scope=hash(tenant-a)
+--- request
+POST /scoped-semantic
+{"messages":[{"role":"user","content":"What is the capital of France??"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-a
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 12: tenant-b same prompt - MISS (FT.SEARCH scope filter excludes tenant-a's entry)
+--- request
+POST /scoped-semantic
+{"messages":[{"role":"user","content":"What is the capital of France??"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-b
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 13: tenant-a paraphrase - HIT-L2 (scope filter finds tenant-a's entry)
+--- request
+POST /scoped-semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-a
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L2
+
+
+
+=== TEST 14: tenant-b paraphrase - HIT-L2 (proves tenant-b has own L2 entry)
+--- request
+POST /scoped-semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-Tenant-Id: tenant-b
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L2
diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t
new file mode 100644
index 000000000000..abc652328ac8
--- /dev/null
+++ b/t/plugin/ai-cache.t
@@ -0,0 +1,818 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+BEGIN {
+ $ENV{TEST_ENABLE_CONTROL_API_V1} = "0";
+}
+
+use t::APISIX 'no_plan';
+
+log_level("info");
+repeat_each(1);
+no_long_string();
+no_shuffle();
+no_root_location();
+
+add_block_preprocessor(sub {
+ my ($block) = @_;
+
+ if (!defined $block->request) {
+ $block->set_value("request", "GET /t");
+ }
+
+ if (!$block->error_log && !$block->no_error_log) {
+ $block->set_value("no_error_log", "[error]\n[alert]");
+ }
+
+ if (!defined $block->http_config) {
+ $block->set_value("http_config", <<_EOC_);
+server {
+ listen 1990;
+ default_type 'application/json';
+
+ location /v1/embeddings {
+ content_by_lua_block {
+ local fixture_loader = require("lib.fixture_loader")
+ local content, err = fixture_loader.load("openai/embeddings-list.json")
+ if not content then
+ ngx.status = 500
+ ngx.say(err)
+ return
+ end
+
+ ngx.status = 200
+ ngx.print(content)
+ }
+ }
+}
+_EOC_
+ }
+});
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: valid config - exact layer only
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "exact" },
+ exact = { ttl = 600 },
+ redis_host = "127.0.0.1",
+ redis_port = 6379,
+ })
+
+ if not ok then
+ ngx.say("failed")
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 2: valid config - both layers with semantic embedding
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "exact", "semantic" },
+ exact = { ttl = 3600 },
+ semantic = {
+ similarity_threshold = 0.95,
+ ttl = 86400,
+ embedding = {
+ provider = "openai",
+ endpoint = "https://api.openai.com/v1/embeddings",
+ api_key = "sk-test",
+ },
+ },
+ redis_host = "127.0.0.1",
+ redis_port = 6379,
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 3: semantic without embedding config - should fail
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "semantic" },
+ redis_host = "127.0.0.1",
+ })
+
+ if not ok then
+ ngx.say("failed: ", err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body
+failed: semantic layer requires semantic.embedding to be configured
+
+
+
+=== TEST 4: invalid layer value - should fail
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "invalid_layer" },
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body eval
+qr/.*property "layers" validation failed:.*matches none of the enum values.*/
+
+
+
+=== TEST 5: unsupported embedding provider - should fail
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "semantic" },
+ semantic = {
+ embedding = {
+ provider = "some-unknown-provider",
+ endpoint = "https://example.com/embeddings",
+ api_key = "key",
+ },
+ },
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body eval
+qr/.*property "provider" validation failed: matches none of the enum values.*/
+
+
+
+=== TEST 6: similarity_threshold out of range - should fail
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = { "semantic" },
+ semantic = {
+ similarity_threshold = 1.5,
+ embedding = {
+ provider = "openai",
+ endpoint = "https://api.openai.com/v1/embeddings",
+ api_key = "sk-test",
+ },
+ },
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body eval
+qr/.*property "similarity_threshold" validation failed: expected 1\.5 to be at most.*/
+
+
+
+=== TEST 7: layers empty array - should fail (minItems=1)
+--- config
+ location /t {
+ content_by_lua_block {
+ local plugin = require("apisix.plugins.ai-cache")
+ local ok, err = plugin.check_schema({
+ layers = {},
+ redis_host = "127.0.0.1",
+ })
+
+ if not ok then
+ ngx.say(err)
+ else
+ ngx.say("passed")
+ end
+ }
+ }
+--- response_body eval
+qr/.*property "layers" validation failed: expect array to have at least 1 items.*/
+
+
+
+=== TEST 8: set up route for L1 cache tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/exact",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1",
+ "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}]
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 9: first request - cache MISS, upstream called
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the answer to life?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+
+
+
+=== TEST 10: second identical request - cache HIT-L1, no upstream call
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the answer to life?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+--- response_headers_like
+X-AI-Cache-Age: \d+
+--- response_body_like eval
+qr/"content":\s?"1 \+ 1 = 2\."/
+--- error_log
+ai-cache: L1 hit for key
+
+
+
+=== TEST 11: bypass header - BYPASS, upstream called, not cached
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the bypass question?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-Cache-Bypass: 1
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: BYPASS
+
+
+
+=== TEST 12: same prompt without bypass after bypass - still MISS (bypass did not cache)
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the bypass question?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 13: set up route with two bypass rules
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/1',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/exact",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1",
+ "bypass_on": [
+ {"header": "X-Cache-Bypass", "equals": "1"},
+ {"header": "X-Debug", "equals": "true"}
+ ]
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 14: first bypass rule matches - BYPASS
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"multi-rule bypass test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-Cache-Bypass: 1
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: BYPASS
+
+
+
+=== TEST 15: second bypass rule matches - BYPASS
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"multi-rule bypass test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-Debug: true
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: BYPASS
+
+
+
+=== TEST 16: set up route for upstream-status filter tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/2',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/error",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 17: non-2xx upstream response - not cached (status code filter)
+--- request
+POST /error
+{"messages":[{"role":"user","content":"trigger a server error"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-AI-Fixture-Status: 500
+--- error_code: 500
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 18: same prompt after non-2xx - still MISS (was not cached)
+--- request
+POST /error
+{"messages":[{"role":"user","content":"trigger a server error"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-AI-Fixture-Status: 500
+--- error_code: 500
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 19: set up route with very small max_cache_body_size
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/3',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/tiny",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "max_cache_body_size": 5,
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 20: oversize response - MISS, log warns and skips cache write
+--- request
+POST /tiny
+{"messages":[{"role":"user","content":"oversize body test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+--- error_log
+exceeds max_cache_body_size
+
+
+
+=== TEST 21: same prompt after oversize - still MISS (was not cached)
+--- request
+POST /tiny
+{"messages":[{"role":"user","content":"oversize body test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+--- error_log
+exceeds max_cache_body_size
+
+
+
+=== TEST 22: set up route with custom cache header names
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/4',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/custom-headers",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "headers": {
+ "cache_status": "X-Custom-Status",
+ "cache_age": "X-Custom-Age"
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 23: MISS populates the cache and emits custom status header
+--- request
+POST /custom-headers
+{"messages":[{"role":"user","content":"custom header test"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-Custom-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+--- wait: 1
+
+
+
+=== TEST 24: HIT emits custom status and age headers (defaults not used)
+--- request
+POST /custom-headers
+{"messages":[{"role":"user","content":"custom header test"}]}
+--- more_headers
+Content-Type: application/json
+--- error_code: 200
+--- response_headers
+X-Custom-Status: HIT-L1
+X-AI-Cache-Status:
+X-AI-Cache-Age:
+--- response_headers_like
+X-Custom-Age: \d+
+--- response_body_like eval
+qr/"content":\s?"1 \+ 1 = 2\."/
+
+
+
+=== TEST 25: clean up Redis cache state before semantic tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local redis = require("resty.redis")
+ local red = redis:new()
+ red:set_timeout(1000)
+ assert(red:connect("127.0.0.1", 6379))
+
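+            -- Drop the vector index and its documents ("DD") left over from a
+            -- previous run. The reply is deliberately ignored: the call fails
+            -- harmlessly when the index does not exist yet.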
+ red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD")
+
+ local keys = red:keys("ai-cache:*")
+ if type(keys) == "table" and #keys > 0 then
+ red:del(unpack(keys))
+ end
+
+ red:close()
+ ngx.say("ok")
+ }
+ }
+--- response_body
+ok
+
+
+
+=== TEST 26: set up route for L2 semantic cache tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+ local code, body = t('/apisix/admin/routes/5',
+ ngx.HTTP_PUT,
+ [[{
+ "uri": "/semantic",
+ "plugins": {
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact", "semantic"],
+ "exact": {
+ "ttl": 60
+ },
+ "semantic": {
+ "similarity_threshold": 0.90,
+ "ttl": 300,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "http://127.0.0.1:1990/v1/embeddings",
+ "api_key": "test-key"
+ }
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]]
+ )
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ }
+ }
+--- response_body
+passed
+
+
+
+=== TEST 27: L2 - first request, cache MISS, stored in L2
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"What is the capital of France??"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/
+
+
+
+=== TEST 28: L2 - different wording hits L2 (same vector from fixture)
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L2
+--- response_headers_like
+X-AI-Cache-Similarity: \d+(\.\d+)?
+--- response_body_like eval
+qr/"content":\s?"1 \+ 1 = 2\."/
+--- error_log
+ai-cache: L2 hit
+
+
+
+=== TEST 29: L2 - paraphrase now hits L1 (backfilled by the previous L2 hit)
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+--- response_body_like eval
+qr/"content":\s?"1 \+ 1 = 2\."/
+--- error_log
+ai-cache: L1 hit for key
+
+
+
+=== TEST 30: streaming MISS - upstream called, response cached via log phase
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-streaming.sse
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- response_body_like eval
+qr/data:.*"content":"Hello"/
+
+
+
+=== TEST 31: streaming HIT - Content-Type is text/event-stream, SSE body returned
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true}
+--- more_headers
+Content-Type: application/json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+Content-Type: text/event-stream
+--- response_body_like eval
+qr/data:.*"content":\s?"Hello!"/
+--- wait: 1
+
+
+
+=== TEST 32: non-streaming HIT after streaming MISS - returns JSON
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"Stream me something cool"}]}
+--- more_headers
+Content-Type: application/json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+Content-Type: application/json
+--- response_body_like eval
+qr/"content":\s?"Hello!"/
diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t
new file mode 100644
index 000000000000..3af1a6ae2491
--- /dev/null
+++ b/t/plugin/prometheus-ai-cache.t
@@ -0,0 +1,481 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+BEGIN {
+ $ENV{TEST_ENABLE_CONTROL_API_V1} = "0";
+
+ if ($ENV{TEST_NGINX_CHECK_LEAK}) {
+ $SkipReason = "unavailable for the hup tests";
+ } else {
+ $ENV{TEST_NGINX_USE_HUP} = 1;
+ undef $ENV{TEST_NGINX_USE_STAP};
+ }
+}
+
+use t::APISIX 'no_plan';
+
+repeat_each(1);
+no_long_string();
+no_shuffle();
+no_root_location();
+
+add_block_preprocessor(sub {
+ my ($block) = @_;
+
+ if (!defined $block->request) {
+ $block->set_value("request", "GET /t");
+ }
+
+ my $user_yaml_config = <<_EOC_;
+plugin_attr:
+ prometheus:
+ refresh_interval: 0.1
+plugins:
+ - ai-proxy
+ - ai-cache
+ - prometheus
+ - public-api
+ - key-auth
+_EOC_
+ $block->set_value("extra_yaml_config", $user_yaml_config);
+
+ if (!defined $block->http_config) {
+ $block->set_value("http_config", <<_EOC_);
+server {
+ listen 1990;
+ default_type 'application/json';
+
+ location /v1/embeddings {
+ content_by_lua_block {
+ local fixture_loader = require("lib.fixture_loader")
+ local content, err = fixture_loader.load("openai/embeddings-list.json")
+ if not content then
+ ngx.status = 500
+ ngx.say(err)
+ return
+ end
+
+ ngx.status = 200
+ ngx.print(content)
+ }
+ }
+
+ location /v1/embeddings-fail {
+ content_by_lua_block {
+ ngx.status = 500
+ ngx.say('{"error":"simulated embedding failure"}')
+ }
+ }
+}
+_EOC_
+ }
+});
+
+run_tests;
+
+__DATA__
+
+=== TEST 1: set up routes
+--- config
+ location /t {
+ content_by_lua_block {
+ local t = require("lib.test_admin").test
+
+ local routes = {
+ {
+ url = "/apisix/admin/routes/1",
+ data = [[{
+ "uri": "/exact",
+ "plugins": {
+ "prometheus": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1",
+ "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}]
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/routes/2",
+ data = [[{
+ "uri": "/semantic",
+ "plugins": {
+ "prometheus": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact", "semantic"],
+ "exact": { "ttl": 60 },
+ "semantic": {
+ "similarity_threshold": 0.90,
+ "ttl": 300,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "http://127.0.0.1:1990/v1/embeddings",
+ "api_key": "test-key"
+ }
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/routes/3",
+ data = [[{
+ "uri": "/semantic-fail",
+ "plugins": {
+ "prometheus": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["semantic"],
+ "semantic": {
+ "similarity_threshold": 0.90,
+ "ttl": 300,
+ "embedding": {
+ "provider": "openai",
+ "endpoint": "http://127.0.0.1:1990/v1/embeddings-fail",
+ "api_key": "test-key"
+ }
+ },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/routes/4",
+ data = [[{
+ "uri": "/exact-auth",
+ "plugins": {
+ "prometheus": {},
+ "key-auth": {},
+ "ai-proxy": {
+ "provider": "openai",
+ "auth": {
+ "header": {
+ "Authorization": "Bearer test-key"
+ }
+ },
+ "override": {
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions"
+ }
+ },
+ "ai-cache": {
+ "layers": ["exact"],
+ "exact": { "ttl": 60 },
+ "redis_host": "127.0.0.1"
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/consumers",
+ data = [[{
+ "username": "alice",
+ "plugins": {
+ "key-auth": {
+ "key": "alice-key"
+ }
+ }
+ }]],
+ },
+ {
+ url = "/apisix/admin/routes/metrics",
+ data = [[{
+ "plugins": {
+ "public-api": {}
+ },
+ "uri": "/apisix/prometheus/metrics"
+ }]],
+ },
+ }
+
+ for _, route in ipairs(routes) do
+ local code, body = t(route.url, ngx.HTTP_PUT, route.data)
+ if code >= 300 then
+ ngx.status = code
+ end
+ ngx.say(body)
+ end
+ }
+ }
+--- response_body eval
+"passed\n" x 6
+
+
+
+=== TEST 2: MISS request - upstream called
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the meaning of life?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 3: same request - HIT-L1
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the meaning of life?"}]}
+--- more_headers
+Content-Type: application/json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+--- wait: 1
+
+
+
+=== TEST 4: verify miss counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1/
+
+
+
+=== TEST 5: verify hit counter with layer label
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1/
+
+
+
+=== TEST 6: BYPASS request - upstream called, no cache interaction
+--- request
+POST /exact
+{"messages":[{"role":"user","content":"What is the meaning of life?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+X-Cache-Bypass: 1
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: BYPASS
+
+
+
+=== TEST 7: verify BYPASS did not increment misses counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1\n/
+
+
+
+=== TEST 8: verify BYPASS did not increment hits counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1\n/
+
+
+
+=== TEST 9: cleanup Redis cache state before semantic tests
+--- config
+ location /t {
+ content_by_lua_block {
+ local redis = require("resty.redis")
+ local red = redis:new()
+ red:set_timeout(1000)
+ assert(red:connect("127.0.0.1", 6379))
+
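+            -- Drop the vector index and its documents ("DD") left over from a
+            -- previous run. The reply is deliberately ignored: the call fails
+            -- harmlessly when the index does not exist yet.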
+ red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD")
+
+ local keys = red:keys("ai-cache:*")
+ if type(keys) == "table" and #keys > 0 then
+ red:del(unpack(keys))
+ end
+
+ red:close()
+ ngx.say("ok")
+ }
+ }
+--- response_body
+ok
+
+
+
+=== TEST 10: L2 first request - MISS, embedding API called
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"What is the capital of France??"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 11: L2 second request - different wording, HIT-L2
+--- request
+POST /semantic
+{"messages":[{"role":"user","content":"Name the capital city of France"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L2
+--- wait: 1
+
+
+
+=== TEST 12: verify miss counter for semantic route (route_id=2)
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="2",service_id="",consumer=""\} 1/
+
+
+
+=== TEST 13: verify hits counter with layer="l2"
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_hits_total\{route_id="2",service_id="",consumer="",layer="l2"\} 1/
+
+
+
+=== TEST 14: verify embedding latency histogram with provider label
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_embedding_latency_count\{route_id="2",service_id="",consumer="",provider="openai"\} 2/
+
+
+
+=== TEST 15: embedding failure - request still returns 200 via fallback
+--- request
+POST /semantic-fail
+{"messages":[{"role":"user","content":"What does this fail at?"}]}
+--- more_headers
+Content-Type: application/json
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+--- wait: 1
+
+
+
+=== TEST 16: verify embedding_failures counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_embedding_failures_total\{route_id="3",service_id="",consumer=""\} 1/
+
+
+
+=== TEST 17: verify embedding-failure request also counted as miss
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="3",service_id="",consumer=""\} 1/
+
+
+
+=== TEST 18: authenticated MISS request - consumer alice
+--- request
+POST /exact-auth
+{"messages":[{"role":"user","content":"Authenticated cache test"}]}
+--- more_headers
+Content-Type: application/json
+apikey: alice-key
+X-AI-Fixture: openai/chat-basic.json
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: MISS
+
+
+
+=== TEST 19: authenticated HIT-L1 request - consumer alice
+--- request
+POST /exact-auth
+{"messages":[{"role":"user","content":"Authenticated cache test"}]}
+--- more_headers
+Content-Type: application/json
+apikey: alice-key
+--- error_code: 200
+--- response_headers
+X-AI-Cache-Status: HIT-L1
+--- wait: 1
+
+
+
+=== TEST 20: verify consumer label is populated on hits counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_hits_total\{route_id="4",service_id="",consumer="alice",layer="l1"\} 1/
+
+
+
+=== TEST 21: verify consumer label is populated on misses counter
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_ai_cache_misses_total\{route_id="4",service_id="",consumer="alice"\} 1/
+
+
+
+=== TEST 22: verify cache hit is labelled as ai_chat (not traditional_http)
+--- request
+GET /apisix/prometheus/metrics
+--- response_body_like eval
+qr/apisix_http_status\{code="200",route="1"[^}]*request_type="ai_chat"[^}]*response_source="apisix"[^}]*\} 1/