diff --git a/Makefile b/Makefile index 71ab7df1eabf..ad074ecabbdf 100644 --- a/Makefile +++ b/Makefile @@ -388,6 +388,11 @@ install: runtime $(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-transport $(ENV_INSTALL) apisix/plugins/ai-transport/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-transport + $(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache + $(ENV_INSTALL) apisix/plugins/ai-cache/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache + $(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings + $(ENV_INSTALL) apisix/plugins/ai-cache/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-cache/embeddings + $(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings $(ENV_INSTALL) apisix/plugins/ai-rag/embeddings/*.lua $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/embeddings $(ENV_INSTALL) -d $(ENV_INST_LUADIR)/apisix/plugins/ai-rag/vector-search diff --git a/apisix/cli/config.lua b/apisix/cli/config.lua index 956eef30c267..b4df580666c0 100644 --- a/apisix/cli/config.lua +++ b/apisix/cli/config.lua @@ -231,6 +231,7 @@ local _M = { "ai-prompt-template", "ai-prompt-decorator", "ai-prompt-guard", + "ai-cache", "ai-rag", "ai-rate-limiting", "ai-proxy-multi", diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua new file mode 100644 index 000000000000..476bd785d85a --- /dev/null +++ b/apisix/plugins/ai-cache.lua @@ -0,0 +1,296 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +local core = require("apisix.core") +local schema = require("apisix.plugins.ai-cache.schema") +local exact = require("apisix.plugins.ai-cache.exact") +local semantic = require("apisix.plugins.ai-cache.semantic") +local protocols = require("apisix.plugins.ai-protocols") +local http = require("resty.http") +local ngx = ngx +local ngx_time = ngx.time +local ngx_now = ngx.now +local ipairs = ipairs +local require = require +local tostring = tostring +local table_concat = table.concat + +local plugin_name = "ai-cache" + +local _M = { + version = 0.1, + priority = 1065, + name = plugin_name, + schema = schema.schema +} + + +local function layer_enabled(conf, name) + local layers = conf.layers or { "exact", "semantic" } + for _, l in ipairs(layers) do + if l == name then return true end + end + return false +end + + +local function populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text) + ctx.ai_client_protocol = protocol_name + ctx.var.request_type = is_stream and "ai_stream" or "ai_chat" + if body_tab.model then + ctx.var.request_llm_model = body_tab.model + ctx.var.llm_model = body_tab.model + end + ctx.var.llm_response_text = cached_text +end + + +function _M.check_schema(conf) + local ok, err = core.schema.check(schema.schema, conf) + if not ok then + return false, err + end + + if layer_enabled(conf, "semantic") then + if not (conf.semantic and conf.semantic.embedding) then + return false, "semantic layer requires semantic.embedding to be configured" + end + end + + core.utils.check_https({ "semantic.embedding.endpoint" }, conf, plugin_name) + + return true +end + + +function _M.access(conf, ctx) + -- Check bypass_on conditions + if conf.bypass_on then + local req_headers = ngx.req.get_headers() + for _, rule in ipairs(conf.bypass_on) do + if req_headers[rule.header] == rule.equals then + ctx.ai_cache_status = "BYPASS" + return + end + end + end + + local body_tab, err = core.request.get_json_request_body_table() + if not body_tab then + core.log.warn("ai-cache: failed to read request body: ", err or "unknown error") + ctx.ai_cache_status = "MISS" + return + end + + local protocol_name = protocols.detect(body_tab, ctx) + if not protocol_name then + core.log.warn("ai-cache: could not detect AI protocol, skipping cache") + ctx.ai_cache_status = "MISS" + return + end + + local proto = protocols.get(protocol_name) + local contents = proto.extract_request_content(body_tab) + if not contents or #contents == 0 then + ctx.ai_cache_status = "MISS" + return + end + + local prompt_text = table_concat(contents, " ") + local scope_hash = exact.compute_scope_hash(conf, ctx) + local prompt_hash = exact.compute_prompt_hash(prompt_text) + + local is_stream = body_tab.stream == true + + -- L1 exact lookup + if layer_enabled(conf, "exact") then + local cached_text, written_at, lookup_err = exact.get(conf, scope_hash, prompt_hash) + if lookup_err then + core.log.warn("ai-cache: L1 lookup error: ", lookup_err) + elseif cached_text then + core.log.info("ai-cache: L1 hit for key: ", prompt_hash) + ctx.ai_cache_status = "HIT-L1" + ctx.ai_cache_written_at = written_at + if is_stream then + core.response.set_header("Content-Type", "text/event-stream") + else + core.response.set_header("Content-Type", "application/json") + end + populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text) + -- TODO: rename build_deny_response to build_response_from_text in a + -- follow-up. We use it here to wrap cached text in the protocol's + -- response shape, not for policy denial. 
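+            -- The replayed body is rebuilt from cached_text alone, so
+            -- upstream-only fields (created, model, usage, system_fingerprint)
+            -- are absent on cache hits.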
+ return 200, proto.build_deny_response({ + stream = is_stream, + text = cached_text, + }) + end + end + + -- L2 semantic lookup + if layer_enabled(conf, "semantic") then + local emb_conf = conf.semantic.embedding + local emb_driver = require("apisix.plugins.ai-cache.embeddings." .. emb_conf.provider) + local httpc = http.new() + + local t0 = ngx_now() + local embedding, _, emb_err = emb_driver.get_embeddings( + emb_conf, prompt_text, httpc, emb_conf.ssl_verify + ) + if not embedding then + core.log.warn("ai-cache: embedding fetch failed (degrading to MISS): ", emb_err) + ctx.ai_cache_embedding_failed = true + else + ctx.ai_cache_embedding_latency_ms = (ngx_now() - t0) * 1000 + ctx.ai_cache_embedding_provider = emb_conf.provider + ctx.ai_cache_embedding = embedding + + local threshold = conf.semantic.similarity_threshold or 0.95 + local cached_text, similarity, search_err = semantic.search( + conf, scope_hash, embedding, threshold + ) + + if search_err then + core.log.warn("ai-cache: L2 search error (degrading to MISS): ", search_err) + elseif cached_text then + core.log.info("ai-cache: L2 hit, similarity=", similarity) + + if layer_enabled(conf, "exact") then + local l1_ttl = (conf.exact and conf.exact.ttl) or 3600 + local l1_err = exact.set( + conf, scope_hash, prompt_hash, cached_text, l1_ttl + ) + if l1_err then + core.log.warn("ai-cache: L2->L1 backfill failed: ", l1_err) + end + end + + ctx.ai_cache_status = "HIT-L2" + ctx.ai_cache_similarity = similarity + if is_stream then + core.response.set_header("Content-Type", "text/event-stream") + else + core.response.set_header("Content-Type", "application/json") + end + populate_ai_ctx_on_hit(ctx, protocol_name, body_tab, is_stream, cached_text) + return 200, proto.build_deny_response({ + stream = is_stream, + text = cached_text, + }) + end + end + end + + ctx.ai_cache_status = "MISS" + ctx.ai_cache_scope_hash = scope_hash + ctx.ai_cache_prompt_hash = prompt_hash + ctx.ai_cache_prompt_text = prompt_text +end + + +function _M.header_filter(conf, ctx) + if not ctx.ai_cache_status then + return + end + + local status_header = (conf.headers and conf.headers.cache_status) + or "X-AI-Cache-Status" + ngx.header[status_header] = ctx.ai_cache_status + + if ctx.ai_cache_status == "HIT-L1" and ctx.ai_cache_written_at then + local age_header = (conf.headers and conf.headers.cache_age) + or "X-AI-Cache-Age" + ngx.header[age_header] = tostring(ngx_time() - ctx.ai_cache_written_at) + end + + if ctx.ai_cache_status == "HIT-L2" and ctx.ai_cache_similarity then + local sim_header = (conf.headers and conf.headers.cache_similarity) + or "X-AI-Cache-Similarity" + ngx.header[sim_header] = tostring(ctx.ai_cache_similarity) + end +end + + +function _M.log(conf, ctx) + if ctx.ai_cache_status ~= "MISS" then + return + end + + -- Early-MISS paths (body parse / protocol detect / empty content) skip + -- key computation, so bail out if cache key fields are absent. 
+ if not ctx.ai_cache_prompt_hash or not ctx.ai_cache_prompt_text then + return + end + + local upstream_status = core.response.get_upstream_status(ctx) or ngx.status + if not upstream_status or upstream_status < 200 or upstream_status >= 300 then + return + end + + local response_text = ctx.var.llm_response_text + if not response_text or response_text == "" then + return + end + + local max_size = conf.max_cache_body_size or 1048576 + if #response_text > max_size then + core.log.warn("ai-cache: response size ", #response_text, + " exceeds max_cache_body_size ", max_size, + ", skipping cache write") + return + end + + local exact_enabled = layer_enabled(conf, "exact") + local semantic_enabled = layer_enabled(conf, "semantic") + local ttl_exact = (conf.exact and conf.exact.ttl) or 3600 + local scope_hash = ctx.ai_cache_scope_hash + local prompt_hash = ctx.ai_cache_prompt_hash + local embedding = ctx.ai_cache_embedding + + local ok, timer_err = ngx.timer.at(0, function(premature) + if premature then + return + end + + if exact_enabled then + local err = exact.set(conf, scope_hash, prompt_hash, response_text, ttl_exact) + if err then + ngx.log(ngx.WARN, "ai-cache: failed to write L1 cache: ", err) + end + end + + if semantic_enabled then + if not embedding then + return + end + + local ttl_semantic = (conf.semantic and conf.semantic.ttl) or 86400 + local store_err = semantic.store( + conf, scope_hash, embedding, response_text, ttl_semantic + ) + if store_err then + ngx.log(ngx.WARN, "ai-cache: failed to write L2 cache: ", store_err) + end + end + end) + if not ok then + core.log.warn("ai-cache: failed to schedule cache write: ", timer_err) + end +end + + +return _M diff --git a/apisix/plugins/ai-cache/embeddings/azure_openai.lua b/apisix/plugins/ai-cache/embeddings/azure_openai.lua new file mode 100644 index 000000000000..6f862ea78cc8 --- /dev/null +++ b/apisix/plugins/ai-cache/embeddings/azure_openai.lua @@ -0,0 +1,76 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +local core = require("apisix.core") +local type = type + +local ngx = ngx +local HTTP_OK = ngx.HTTP_OK +local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR + +local _M = {} + + +function _M.get_embeddings(conf, text, httpc, ssl_verify) + local body, err = core.json.encode({ input = text }) + if not body then + return nil, HTTP_INTERNAL_SERVER_ERROR, err + end + + httpc:set_timeout(conf.timeout) + + local res, err = httpc:request_uri(conf.endpoint, { + method = "POST", + headers = { + ["Content-Type"] = "application/json", + ["api-key"] = conf.api_key, + }, + body = body, + ssl_verify = ssl_verify, + keepalive = true, + }) + + if not res or not res.body then + return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API" + end + + if res.status ~= HTTP_OK then + return nil, res.status, res.body + end + + local res_tab, err = core.json.decode(res.body) + if not res_tab then + return nil, HTTP_INTERNAL_SERVER_ERROR, err + end + + if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then + return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body + end + + local embedding = res_tab.data[1].embedding + if type(embedding) ~= "table" then + return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response" + end + if #embedding == 0 then + return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty" + end + + return embedding, nil, nil +end + + +return _M diff --git a/apisix/plugins/ai-cache/embeddings/openai.lua b/apisix/plugins/ai-cache/embeddings/openai.lua new file mode 100644 index 000000000000..740b12d23f2d --- /dev/null +++ b/apisix/plugins/ai-cache/embeddings/openai.lua @@ -0,0 +1,79 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local core = require("apisix.core") +local type = type + +local ngx = ngx +local HTTP_OK = ngx.HTTP_OK +local HTTP_INTERNAL_SERVER_ERROR = ngx.HTTP_INTERNAL_SERVER_ERROR + +local _M = {} + + +function _M.get_embeddings(conf, text, httpc, ssl_verify) + local body, err = core.json.encode({ + input = text, + model = conf.model or "text-embedding-3-small", + }) + if not body then + return nil, HTTP_INTERNAL_SERVER_ERROR, err + end + + httpc:set_timeout(conf.timeout) + + local res, err = httpc:request_uri(conf.endpoint, { + method = "POST", + headers = { + ["Content-Type"] = "application/json", + ["Authorization"] = "Bearer " .. 
conf.api_key, + }, + body = body, + ssl_verify = ssl_verify, + keepalive = true, + }) + + if not res or not res.body then + return nil, HTTP_INTERNAL_SERVER_ERROR, err or "no response from embeddings API" + end + + if res.status ~= HTTP_OK then + return nil, res.status, res.body + end + + local res_tab, err = core.json.decode(res.body) + if not res_tab then + return nil, HTTP_INTERNAL_SERVER_ERROR, err + end + + if type(res_tab.data) ~= "table" or core.table.isempty(res_tab.data) then + return nil, HTTP_INTERNAL_SERVER_ERROR, "unexpected embedding response: " .. res.body + end + + local embedding = res_tab.data[1].embedding + if type(embedding) ~= "table" then + return nil, HTTP_INTERNAL_SERVER_ERROR, "missing embedding field in response" + end + if #embedding == 0 then + return nil, HTTP_INTERNAL_SERVER_ERROR, "embedding vector is empty" + end + + return embedding, nil, nil +end + + +return _M diff --git a/apisix/plugins/ai-cache/exact.lua b/apisix/plugins/ai-cache/exact.lua new file mode 100644 index 000000000000..e1a63f9a5f41 --- /dev/null +++ b/apisix/plugins/ai-cache/exact.lua @@ -0,0 +1,136 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local core = require("apisix.core") +local redis = require("apisix.utils.redis") +local resty_sha256 = require("resty.sha256") +local to_hex = require("resty.string").to_hex + +local ngx = ngx +local ngx_time = ngx.time +local ipairs = ipairs +local tostring = tostring +local table_concat = table.concat + +local KEY_PREFIX = "ai-cache:l1:" + +local _M = {} + + +local function sha256_hex(s) + local hash = resty_sha256:new() + hash:update(s) + return to_hex(hash:final()) +end + + +function _M.compute_scope_hash(conf, ctx) + local cache_key = conf.cache_key + if not cache_key then + return "" + end + + local parts = {} + local n = 0 + + if cache_key.include_consumer then + n = n + 1 + parts[n] = ctx.consumer_name or "" + end + + if cache_key.include_vars then + for _, var_name in ipairs(cache_key.include_vars) do + local key = var_name + if key:sub(1, 1) == "$" then + key = key:sub(2) + end + n = n + 1 + parts[n] = tostring(ctx.var[key] or "") + end + end + + if n == 0 then + return "" + end + + return sha256_hex(table_concat(parts, "|")) +end + + +function _M.compute_prompt_hash(text) + return sha256_hex(text) +end + + +function _M.get(conf, scope_hash, prompt_hash) + local red, err = redis.new(conf) + if not red then + return nil, nil, err + end + + local key = KEY_PREFIX .. scope_hash .. ":" .. 
prompt_hash + local res, get_err = red:get(key) + if get_err then + red:close() + return nil, nil, get_err + end + + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + + if res == ngx.null then + return nil, nil, nil + end + + local entry, decode_err = core.json.decode(res) + if not entry then + return nil, nil, "corrupt cache entry: " .. decode_err + end + + return entry.text, entry.written_at, nil +end + + +function _M.set(conf, scope_hash, prompt_hash, text, ttl) + local red, err = redis.new(conf) + if not red then + return err + end + + local key = KEY_PREFIX .. scope_hash .. ":" .. prompt_hash + local entry, encode_err = core.json.encode({ + text = text, + written_at = ngx_time(), + }) + + if not entry then + red:close() + return encode_err + end + + local ok, set_err = red:set(key, entry, "EX", ttl) + if not ok then + red:close() + return set_err + end + + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + + return nil +end + + +return _M diff --git a/apisix/plugins/ai-cache/schema.lua b/apisix/plugins/ai-cache/schema.lua new file mode 100644 index 000000000000..02587f7fb14c --- /dev/null +++ b/apisix/plugins/ai-cache/schema.lua @@ -0,0 +1,189 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local redis_schema = require("apisix.utils.redis-schema") + +local _M = {} + +local embedding_schema = { + type = "object", + properties = { + provider = { + type = "string", + enum = { "openai", "azure_openai" }, + description = "Embedding API provider.", + }, + model = { + type = "string", + description = "Embedding model name. Sent in the request body for " + .. "provider: openai; ignored for provider: azure_openai " + .. "(Azure infers the model from the deployment URL).", + }, + endpoint = { + type = "string", + description = "Embedding API endpoint URL.", + }, + api_key = { + type = "string", + description = "API key for the embedding provider.", + }, + timeout = { + type = "integer", + minimum = 1, + maximum = 600000, + default = 5000, + description = "HTTP request timeout in milliseconds for embedding API calls.", + }, + ssl_verify = { + type = "boolean", + default = true, + description = "Whether to verify the embedding endpoint's TLS certificate.", + }, + }, + required = { "provider", "endpoint", "api_key" }, +} + +local semantic_schema = { + type = "object", + properties = { + similarity_threshold = { + type = "number", + minimum = 0, + maximum = 1, + default = 0.95, + description = "Minimum cosine similarity required for a semantic-layer hit.", + }, + top_k = { + type = "integer", + minimum = 1, + maximum = 100, + default = 1, + description = "Number of nearest-neighbor candidates the index returns; " + .. 
"the first candidate above similarity_threshold is used.", + }, + ttl = { + type = "integer", + minimum = 1, + default = 86400, + description = "Time-to-live in seconds for semantic-layer entries.", + }, + embedding = embedding_schema, + }, + required = { "embedding" }, +} + +local exact_schema = { + type = "object", + properties = { + ttl = { + type = "integer", + minimum = 1, + default = 3600, + description = "Time-to-live in seconds for exact-layer entries.", + }, + }, +} + + +local bypass_item_schema = { + type = "object", + properties = { + header = { + type = "string", + description = "Request header name to inspect.", + }, + equals = { + type = "string", + description = "Value to match against the header. " + .. "If equal, the request bypasses the cache.", + }, + }, + required = { "header", "equals" }, +} + +local headers_schema = { + type = "object", + properties = { + cache_status = { + type = "string", + default = "X-AI-Cache-Status", + description = "Response header name for cache status " + .. "(HIT-L1 / HIT-L2 / MISS / BYPASS).", + }, + cache_similarity = { + type = "string", + default = "X-AI-Cache-Similarity", + description = "Response header name for the similarity score of a semantic-layer hit.", + }, + cache_age = { + type = "string", + default = "X-AI-Cache-Age", + description = "Response header name for the age in seconds of an exact-layer hit.", + }, + }, +} + +_M.schema = { + type = "object", + properties = { + layers = { + type = "array", + items = { type = "string", enum = { "exact", "semantic" } }, + uniqueItems = true, + minItems = 1, + default = { "exact", "semantic" }, + description = "Cache layers to enable, queried in order.", + }, + cache_key = { + type = "object", + properties = { + include_consumer = { + type = "boolean", + default = false, + description = "If true, partition the cache by consumer name.", + }, + include_vars = { + type = "array", + items = { type = "string" }, + default = {}, + description = "Additional ctx.var names included in the cache key, " + .. "for example [\"$http_x_tenant_id\"].", + }, + }, + }, + exact = exact_schema, + semantic = semantic_schema, + bypass_on = { + type = "array", + items = bypass_item_schema, + description = "List of {header, equals} rules. " + .. "If any matches, the request bypasses the cache.", + }, + headers = headers_schema, + max_cache_body_size = { + type = "integer", + minimum = 1, + default = 1048576, + description = "Maximum response size in bytes to write to cache. " + .. "Larger responses pass through but are not cached.", + }, + }, + allOf = { redis_schema.schema.redis }, + encrypt_fields = { "semantic.embedding.api_key", "redis_password" }, +} + +return _M diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua new file mode 100644 index 000000000000..6d84dbd28675 --- /dev/null +++ b/apisix/plugins/ai-cache/semantic.lua @@ -0,0 +1,212 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +local apisix_redis = require("apisix.utils.redis") +local uuid = require("resty.jit-uuid") +local ffi = require("ffi") + +local ffi_new = ffi.new +local ffi_string = ffi.string +local ngx_time = ngx.time +local tostring = tostring +local tonumber = tonumber +local type = type + +local _M = {} + + +local function index_name(dim) + return "ai-cache-idx-" .. dim +end + + +local function key_prefix(dim) + return "ai-cache:l2:" .. dim .. ":" +end + +local function pack_vector(vec) + local n = #vec + local buf = ffi_new("float[?]", n) + for i = 0, n - 1 do + buf[i] = vec[i + 1] + end + return ffi_string(buf, n * 4) +end + +local index_ready = {} +local index_unsupported = false + +local function ensure_index(red, dim) + if index_unsupported then + return nil, "RediSearch not supported on this Redis instance" + end + + if index_ready[dim] then + return true + end + + local _, err = red["FT.CREATE"](red, + index_name(dim), + "ON", "HASH", + "PREFIX", "1", key_prefix(dim), + "SCHEMA", + "embedding", "VECTOR", "HNSW", "6", + "TYPE", "FLOAT32", + "DIM", tostring(dim), + "DISTANCE_METRIC", "COSINE", + "scope", "TAG", + "created_at", "NUMERIC" + ) + + if err then + -- RediSearch module absent — latch and stop retrying on every request + if err:find("unknown command", 1, true) + or err:find("ERR unknown", 1, true) then + index_unsupported = true + return nil, "RediSearch not supported on this Redis instance: " .. err + end + if not err:find("already exists") then + return nil, "FT.CREATE failed: " .. err + end + end + + index_ready[dim] = true + return true +end + + +function _M.search(conf, scope_hash, embedding_vec, threshold) + local red, err = apisix_redis.new(conf) + if not red then + return nil, nil, err + end + + local ok, init_err = ensure_index(red, #embedding_vec) + if not ok then + red:close() + return nil, nil, init_err + end + + local binary_vec = pack_vector(embedding_vec) + local top_k = (conf.semantic and conf.semantic.top_k) or 1 + local top_k_str = tostring(top_k) + + local query + if scope_hash == "" then + query = "*=>[KNN " .. top_k_str .. " @embedding $vec AS dist]" + else + query = "@scope:{" .. scope_hash .. "}=>[KNN " .. top_k_str + .. " @embedding $vec AS dist]" + end + + local res, search_err = red["FT.SEARCH"](red, + index_name(#embedding_vec), + query, + "PARAMS", "2", "vec", binary_vec, + "SORTBY", "dist", "ASC", + "LIMIT", "0", top_k_str, + "RETURN", "2", "response", "dist", + "DIALECT", "2" + ) + + if search_err then + red:close() + -- index was dropped externally — invalidate so next call recreates + if search_err:find("Unknown Index name", 1, true) then + index_ready[#embedding_vec] = nil + end + return nil, nil, search_err + end + + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + + if not res or res[1] == 0 then + return nil, nil, nil + end + + -- RESP2: {count, key1, fields1, key2, fields2, ...} + -- Results are sorted by dist ASC. Iterate candidates and return the first + -- one whose similarity meets the threshold; skip candidates with missing + -- or corrupt fields. 
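+    -- e.g. res = { 1, "ai-cache:l2:1536:<uuid>",
+    --              { "response", "<cached text>", "dist", "0.0934" } }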
+ for i = 3, #res, 2 do + local fields = res[i] + if type(fields) == "table" then + local response_text, dist + for j = 1, #fields, 2 do + if fields[j] == "response" then + response_text = fields[j + 1] + elseif fields[j] == "dist" then + dist = tonumber(fields[j + 1]) + end + end + + if response_text and dist then + local similarity = 1 - dist + if similarity >= threshold then + return response_text, similarity, nil + end + end + end + end + + return nil, nil, nil +end + + +function _M.store(conf, scope_hash, embedding_vec, text, ttl) + local red, err = apisix_redis.new(conf) + if not red then + return err + end + + local ok, init_err = ensure_index(red, #embedding_vec) + if not ok then + red:close() + return init_err + end + + local binary_vec = pack_vector(embedding_vec) + local key = key_prefix(#embedding_vec) .. uuid.generate_v4() + + -- HSET + EXPIRE wrapped in MULTI/EXEC so the entry is never written + -- without its TTL (which would orphan it in Redis forever). + local _, multi_err = red:multi() + if multi_err then + red:close() + return multi_err + end + + red:hset(key, + "embedding", binary_vec, + "response", text, + "scope", scope_hash, + "created_at", tostring(ngx_time()) + ) + red:expire(key, ttl) + + local results, exec_err = red:exec() + if not results then + red:close() + return exec_err + end + + red:set_keepalive(conf.redis_keepalive_timeout, conf.redis_keepalive_pool) + return nil +end + + +return _M diff --git a/apisix/plugins/prometheus/exporter.lua b/apisix/plugins/prometheus/exporter.lua index ce89ca03302a..78ce1bac0bf5 100644 --- a/apisix/plugins/prometheus/exporter.lua +++ b/apisix/plugins/prometheus/exporter.lua @@ -160,6 +160,14 @@ function _M.http_init(prometheus_enabled_in_stream) "llm_completion_tokens", "expire") local llm_active_connections_exptime = core.table.try_read_attr(attr, "metrics", "llm_active_connections", "expire") + local ai_cache_hits_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_hits", "expire") + local ai_cache_misses_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_misses", "expire") + local ai_cache_embedding_latency_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_embedding_latency", "expire") + local ai_cache_embedding_failures_exptime = core.table.try_read_attr(attr, "metrics", + "ai_cache_embedding_failures", "expire") prometheus = base_prometheus.init("prometheus-metrics", metric_prefix) @@ -260,6 +268,35 @@ function _M.http_init(prometheus_enabled_in_stream) unpack(extra_labels("llm_active_connections"))}, llm_active_connections_exptime) + metrics.ai_cache_hits = prometheus:counter("ai_cache_hits_total", + "AI cache hit count by layer", + {"route_id", "service_id", "consumer", "layer", + unpack(extra_labels("ai_cache_hits"))}, + ai_cache_hits_exptime) + + metrics.ai_cache_misses = prometheus:counter("ai_cache_misses_total", + "AI cache miss count", + {"route_id", "service_id", "consumer", + unpack(extra_labels("ai_cache_misses"))}, + ai_cache_misses_exptime) + + local ai_cache_embedding_latency_buckets = DEFAULT_BUCKETS + if attr and attr.ai_cache_embedding_latency_buckets then + ai_cache_embedding_latency_buckets = attr.ai_cache_embedding_latency_buckets + end + metrics.ai_cache_embedding_latency = prometheus:histogram("ai_cache_embedding_latency", + "AI cache embedding API call latency in milliseconds", + {"route_id", "service_id", "consumer", "provider", + unpack(extra_labels("ai_cache_embedding_latency"))}, + ai_cache_embedding_latency_buckets, + ai_cache_embedding_latency_exptime) + 
+ metrics.ai_cache_embedding_failures = prometheus:counter("ai_cache_embedding_failures_total", + "AI cache embedding API call failure count", + {"route_id", "service_id", "consumer", + unpack(extra_labels("ai_cache_embedding_failures"))}, + ai_cache_embedding_failures_exptime) + if prometheus_enabled_in_stream then init_stream_metrics() end @@ -377,6 +414,35 @@ function _M.http_log(conf, ctx) vars.request_type, vars.request_llm_model, vars.llm_model, unpack(extra_labels("llm_completion_tokens", ctx)))) end + + if ctx.ai_cache_status then + if ctx.ai_cache_status == "HIT-L1" then + metrics.ai_cache_hits:inc(1, + gen_arr(route_id, service_id, consumer_name, "l1", + unpack(extra_labels("ai_cache_hits", ctx)))) + elseif ctx.ai_cache_status == "HIT-L2" then + metrics.ai_cache_hits:inc(1, + gen_arr(route_id, service_id, consumer_name, "l2", + unpack(extra_labels("ai_cache_hits", ctx)))) + elseif ctx.ai_cache_status == "MISS" then + metrics.ai_cache_misses:inc(1, + gen_arr(route_id, service_id, consumer_name, + unpack(extra_labels("ai_cache_misses", ctx)))) + end + + if ctx.ai_cache_embedding_latency_ms then + metrics.ai_cache_embedding_latency:observe(ctx.ai_cache_embedding_latency_ms, + gen_arr(route_id, service_id, consumer_name, + ctx.ai_cache_embedding_provider or "", + unpack(extra_labels("ai_cache_embedding_latency", ctx)))) + end + + if ctx.ai_cache_embedding_failed then + metrics.ai_cache_embedding_failures:inc(1, + gen_arr(route_id, service_id, consumer_name, + unpack(extra_labels("ai_cache_embedding_failures", ctx)))) + end + end end @@ -790,6 +856,7 @@ function _M.dec_llm_active_connections(ctx) inc_llm_active_connections(ctx, -1) end + function _M.get_prometheus() return prometheus end diff --git a/conf/config.yaml.example b/conf/config.yaml.example index ae7155a86b06..901774540d70 100644 --- a/conf/config.yaml.example +++ b/conf/config.yaml.example @@ -514,6 +514,7 @@ plugins: # plugin list (sorted by priority) - ai-prompt-template # priority: 1071 - ai-prompt-decorator # priority: 1070 - ai-prompt-guard # priority: 1072 + - ai-cache # priority: 1065 - ai-rag # priority: 1060 - ai-aws-content-moderation # priority: 1050 - ai-proxy-multi # priority: 1041 diff --git a/docs/en/latest/config.json b/docs/en/latest/config.json index d24eacc3f8e9..c198826c7505 100644 --- a/docs/en/latest/config.json +++ b/docs/en/latest/config.json @@ -75,6 +75,7 @@ "plugins/ai-proxy-multi", "plugins/ai-rate-limiting", "plugins/ai-prompt-guard", + "plugins/ai-cache", "plugins/ai-aws-content-moderation", "plugins/ai-aliyun-content-moderation", "plugins/ai-prompt-decorator", diff --git a/docs/en/latest/plugins/ai-cache.md b/docs/en/latest/plugins/ai-cache.md new file mode 100644 index 000000000000..523b727f836f --- /dev/null +++ b/docs/en/latest/plugins/ai-cache.md @@ -0,0 +1,1194 @@ +--- +title: ai-cache +keywords: + - Apache APISIX + - API Gateway + - Plugin + - ai-cache +description: The ai-cache Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache, reducing latency and upstream cost. +--- + + + + + + + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Description + +The `ai-cache` Plugin caches LLM responses in Redis so identical or semantically similar prompts are served from cache instead of incurring another upstream call. It supports two cache layers: an exact-match layer (`exact`) keyed by a hash of the prompt, and a semantic layer (`semantic`) that compares prompt embeddings via Redis Stack vector search. 
Either layer can be enabled independently, and when both are enabled a hit on the semantic layer backfills the exact layer so subsequent identical prompts return immediately. + +The Plugin should be used together with [ai-proxy](./ai-proxy.md) or [ai-proxy-multi](./ai-proxy-multi.md) on the same Route. The semantic layer requires Redis Stack with the RediSearch module and an embedding provider (OpenAI or Azure OpenAI). PRs for additional embedding providers are welcomed. + +## Plugin Attributes + +| Name | Type | Required | Default | Valid values | Description | +| --- | --- | --- | --- | --- | --- | +| `layers` | array[string] | False | `["exact", "semantic"]` | `"exact"`, `"semantic"` | Cache layers to enable, queried in order. | +| `exact.ttl` | integer | False | `3600` | ≥ 1 | Time-to-live in seconds for exact-layer entries. | +| `semantic.similarity_threshold` | number | False | `0.95` | 0–1 | Minimum cosine similarity required for a semantic-layer hit. | +| `semantic.top_k` | integer | False | `1` | [1, 100] | Number of nearest-neighbor candidates the index returns; the first candidate above `similarity_threshold` is used. | +| `semantic.ttl` | integer | False | `86400` | ≥ 1 | Time-to-live in seconds for semantic-layer entries. | +| `semantic.embedding.provider` | string | True (if semantic enabled) | | `"openai"`, `"azure_openai"` | Embedding API provider. | +| `semantic.embedding.endpoint` | string | True (if semantic enabled) | | | Embedding API endpoint URL. | +| `semantic.embedding.api_key` | string | True (if semantic enabled) | | | API key for the embedding provider. Stored encrypted. | +| `semantic.embedding.model` | string | False | | | Embedding model name. Sent in the request body for `provider: openai`; ignored for `provider: azure_openai` (Azure infers the model from the deployment URL). Uses provider default if omitted. | +| `semantic.embedding.timeout` | integer | False | `5000` | [1, 600000] | HTTP request timeout in milliseconds for embedding API calls. | +| `semantic.embedding.ssl_verify` | boolean | False | `true` | | Whether to verify the embedding endpoint's TLS certificate. | +| `cache_key.include_consumer` | boolean | False | `false` | | If `true`, partition the cache by consumer name. | +| `cache_key.include_vars` | array[string] | False | `[]` | | Additional `ctx.var` names included in the cache key, for example `["$http_x_tenant_id"]`. | +| `bypass_on` | array[object] | False | | | List of `{header, equals}` rules. If any matches, the request bypasses the cache. | +| `max_cache_body_size` | integer | False | `1048576` | ≥ 1 | Maximum response size in bytes to write to cache. Larger responses pass through but are not cached. | +| `headers.cache_status` | string | False | `"X-AI-Cache-Status"` | | Response header for cache status (`HIT-L1`, `HIT-L2`, `MISS`, `BYPASS`). | +| `headers.cache_age` | string | False | `"X-AI-Cache-Age"` | | Response header for the age in seconds of an exact-layer hit. | +| `headers.cache_similarity` | string | False | `"X-AI-Cache-Similarity"` | | Response header for the similarity score of a semantic-layer hit. | +| `redis_host` | string | True | | | The address of the Redis node. | +| `redis_port` | integer | False | `6379` | [1,...] | The port of the Redis node. | +| `redis_username` | string | False | | | The username for Redis if Redis ACL is used. If you use the legacy authentication method `requirepass`, configure only the `redis_password`. | +| `redis_password` | string | False | | | The password of the Redis node. 
| +| `redis_database` | integer | False | `0` | >= 0 | The database number in Redis. | +| `redis_timeout` | integer | False | `1000` | [1,...] | The Redis timeout value in milliseconds. | +| `redis_ssl` | boolean | False | `false` | | If `true`, use SSL to connect to Redis. | +| `redis_ssl_verify` | boolean | False | `false` | | If `true`, verify the server SSL certificate. | +| `redis_keepalive_timeout` | integer | False | `10000` | [1000,...] | Idle timeout in milliseconds for the Redis connection in the keepalive pool. | +| `redis_keepalive_pool` | integer | False | `100` | [1,...] | Maximum number of idle Redis connections kept in the keepalive pool. | + +## Examples + +The following examples use OpenAI as the Upstream service provider. Before proceeding, create an [OpenAI account](https://openai.com) and an [API key](https://openai.com/blog/openai-api). You can optionally save the key to an environment variable: + +```shell +export OPENAI_API_KEY= +``` + +If you are working with other LLM providers, please refer to the provider's documentation to obtain an API key. + +:::note + +You can fetch the `admin_key` from `config.yaml` and save to an environment variable with the following command: + +```shell +admin_key=$(yq '.deployment.admin.admin_key[0].key' conf/config.yaml | sed 's/"//g') +``` + +::: + +### Cache Identical Prompts with the Exact Layer + +The following example demonstrates how to use the `ai-cache` Plugin with the exact layer only, so that identical prompts are returned from cache. + + + + +Create a Route that uses [ai-proxy](./ai-proxy.md) to proxy to OpenAI and `ai-cache` to cache exact-match prompts: + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 3600 }, + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such: + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + exact: + ttl: 3600 + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + + + + +Create a Route with the `ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such: + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + exact: + ttl: 3600 + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +Create a Route with the 
`ai-cache` and [ai-proxy](./ai-proxy.md) Plugins configured as such: + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + exact: + ttl: 3600 + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + + + + +Send a request to the Route: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" } + ] + }' +``` + +The first request reaches OpenAI. Note the `X-AI-Cache-Status: MISS` header, indicating the prompt was not in cache and APISIX forwarded the request upstream: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-Da7Iqsqz9gc8Mkf07Hn4NCzAH5Ri1", + "object": "chat.completion", + "created": 1777500252, + "model": "gpt-4o-mini-2024-07-18", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital of France is Paris.", + "refusal": null + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 14, + "completion_tokens": 7, + "total_tokens": 21 + }, + "system_fingerprint": "fp_d3214ccada" +} +``` + +Send the same request again: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" } + ] + }' +``` + +The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header signals an exact-match hit and `X-AI-Cache-Age` reports the entry's age in seconds. The cached response is replayed from Redis, so the body is shorter and does not contain the original `created`, `model`, `usage`, or `system_fingerprint` fields: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: HIT-L1 +X-AI-Cache-Age: 4 + +{ + "id": "f558665e-3a03-42e3-9aa9-f54c402927c0", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "content": "The capital of France is Paris.", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` + +### Cache Paraphrased Prompts with the Semantic Layer + +The following example demonstrates how to enable the semantic layer so that prompts with different wording but similar meaning are served from cache. 
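+The semantic layer requires the RediSearch module, which ships with Redis Stack. Before applying the configuration below, you can confirm the module is loaded on your Redis instance, where it is listed as `search`:
+
+```shell
+redis-cli MODULE LIST
+```
+
+If the module is missing, the plugin logs a warning, latches the semantic layer off for that worker, and requests degrade to a cache `MISS`.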
+ + + + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { "ttl": 3600 }, + "semantic": { + "similarity_threshold": 0.85, + "ttl": 86400, + "embedding": { + "provider": "openai", + "endpoint": "https://api.openai.com/v1/embeddings", + "api_key": "'"$OPENAI_API_KEY"'", + "model": "text-embedding-3-small" + } + }, + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + - semantic + exact: + ttl: 3600 + semantic: + similarity_threshold: 0.85 + ttl: 86400 + embedding: + provider: openai + endpoint: https://api.openai.com/v1/embeddings + api_key: "${OPENAI_API_KEY}" + model: text-embedding-3-small + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + - semantic + exact: + ttl: 3600 + semantic: + similarity_threshold: 0.85 + ttl: 86400 + embedding: + provider: openai + endpoint: https://api.openai.com/v1/embeddings + api_key: your-api-key + model: text-embedding-3-small + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + - semantic + exact: + ttl: 3600 + semantic: + similarity_threshold: 0.85 + ttl: 86400 + embedding: + provider: openai + endpoint: https://api.openai.com/v1/embeddings + api_key: your-api-key + model: text-embedding-3-small + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + + + + +Send a first request: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of China?" 
} + ] + }' +``` + +The first request reaches OpenAI with `X-AI-Cache-Status: MISS`: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-DcCIDs6ZJisclo84FUk5fT2Ks5vzn", + "object": "chat.completion", + "model": "gpt-4-0613", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital city of China is Beijing." + }, + "finish_reason": "stop" + } + ], + "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 } +} +``` + +Wait a couple of seconds for the semantic-layer write to complete in the background, then send a second request with paraphrased wording: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { "role": "user", "content": "Capital city of China?" } + ] + }' +``` + +The semantic layer matches the embedding (cosine similarity above the threshold) and returns the cached response without contacting OpenAI. The `X-AI-Cache-Status: HIT-L2` header signals a semantic-layer hit and `X-AI-Cache-Similarity` reports the cosine similarity score: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: HIT-L2 +X-AI-Cache-Similarity: 0.9065774679184 + +{ + "id": "a95488bb-4a51-491a-bd5b-2c1d0e5f8a9b", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "content": "The capital city of China is Beijing.", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` + +When the `exact` layer is also enabled (as in this example), a semantic-layer hit backfills it, so an immediate retry of the same paraphrase returns `X-AI-Cache-Status: HIT-L1`. + +### Isolate Cache Entries Per Consumer or Tenant + +The following example demonstrates how to namespace cache entries so that one consumer's response is not served to another. Use `cache_key.include_consumer` to partition by consumer name, or `cache_key.include_vars` to include request variables such as a tenant header. 
+ + + + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 3600 }, + "cache_key": { + "include_consumer": true, + "include_vars": ["$http_x_tenant_id"] + }, + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + exact: + ttl: 3600 + cache_key: + include_consumer: true + include_vars: + - "$http_x_tenant_id" + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + exact: + ttl: 3600 + cache_key: + include_consumer: true + include_vars: + - "$http_x_tenant_id" + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + exact: + ttl: 3600 + cache_key: + include_consumer: true + include_vars: + - "$http_x_tenant_id" + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + + + + +Send a first request as `tenant-a`: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Tenant-Id: tenant-a" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of Japan?" } + ] + }' +``` + +The first request reaches OpenAI with `X-AI-Cache-Status: MISS` and primes `tenant-a`'s cache scope: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-DcCRAzeSsimIOIeLQWsKtDxMLAAhu", + "object": "chat.completion", + "model": "gpt-4-0613", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital city of Japan is Tokyo." 
+ }, + "finish_reason": "stop" + } + ], + "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 } +} +``` + +Repeat the same prompt as `tenant-a`: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Tenant-Id: tenant-a" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of Japan?" } + ] + }' +``` + +The second request returns from cache without contacting OpenAI. The `X-AI-Cache-Status: HIT-L1` header confirms `tenant-a`'s entry was reused: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: HIT-L1 +X-AI-Cache-Age: 6 + +{ + "id": "6be4f7a2-83f1-4cdc-8654-cee0396bd4f3", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "content": "The capital city of Japan is Tokyo.", + "role": "assistant" + }, + "finish_reason": "stop" + } + ] +} +``` + +Send the same prompt as a different tenant, `tenant-b`: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Tenant-Id: tenant-b" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital city of Japan?" } + ] + }' +``` + +Even though the prompt is identical, the request reaches OpenAI with `X-AI-Cache-Status: MISS` because `tenant-b` has its own cache scope: + +```text +HTTP/1.1 200 OK +Content-Type: application/json +Server: APISIX/3.16.0 +X-AI-Cache-Status: MISS + +{ + "id": "chatcmpl-DcCROH92JLWcgyhSpwEoutTvqnew5", + "object": "chat.completion", + "model": "gpt-4-0613", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The capital city of Japan is Tokyo." + }, + "finish_reason": "stop" + } + ], + "usage": { "prompt_tokens": 15, "completion_tokens": 8, "total_tokens": 23 } +} +``` + +### Bypass the Cache on a Header + +The following example demonstrates how to skip the cache entirely when a request carries a specific header, for example to refresh a cached response or to support staff debugging. 
+ + + + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -d '{ + "uri": "/anything", + "methods": ["POST"], + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer '"$OPENAI_API_KEY"'" + } + }, + "options": { + "model": "gpt-4" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 3600 }, + "bypass_on": [ + { "header": "X-Cache-Bypass", "equals": "1" } + ], + "redis_host": "127.0.0.1" + } + } + }' +``` + + + + +```yaml title="adc.yaml" +services: + - name: ai-cache-service + routes: + - name: ai-cache-route + uris: + - /anything + methods: + - POST + plugins: + ai-proxy: + provider: openai + auth: + header: + Authorization: "Bearer ${OPENAI_API_KEY}" + options: + model: gpt-4 + ai-cache: + layers: + - exact + exact: + ttl: 3600 + bypass_on: + - header: X-Cache-Bypass + equals: "1" + redis_host: 127.0.0.1 +``` + +Synchronize the configuration to the gateway: + +```shell +adc sync -f adc.yaml +``` + + + + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v1alpha1 +kind: PluginConfig +metadata: + namespace: aic + name: ai-cache-plugin-config +spec: + plugins: + - name: ai-proxy + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + config: + layers: + - exact + exact: + ttl: 3600 + bypass_on: + - header: X-Cache-Bypass + equals: "1" + redis_host: redis-stack +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + parentRefs: + - name: apisix + rules: + - matches: + - path: + type: Exact + value: /anything + method: POST + filters: + - type: ExtensionRef + extensionRef: + group: apisix.apache.org + kind: PluginConfig + name: ai-cache-plugin-config +``` + + + + +```yaml title="ai-cache-ic.yaml" +apiVersion: apisix.apache.org/v2 +kind: ApisixRoute +metadata: + namespace: aic + name: ai-cache-route +spec: + ingressClassName: apisix + http: + - name: ai-cache-route + match: + paths: + - /anything + methods: + - POST + plugins: + - name: ai-proxy + enable: true + config: + provider: openai + auth: + header: + Authorization: "Bearer your-api-key" + options: + model: gpt-4 + - name: ai-cache + enable: true + config: + layers: + - exact + exact: + ttl: 3600 + bypass_on: + - header: X-Cache-Bypass + equals: "1" + redis_host: redis-stack +``` + + + + +Apply the configuration to your cluster: + +```shell +kubectl apply -f ai-cache-ic.yaml +``` + + + + +Send a request with the bypass header: + +```shell +curl -i "http://127.0.0.1:9080/anything" -X POST \ + -H "Content-Type: application/json" \ + -H "X-Cache-Bypass: 1" \ + -d '{ + "messages": [ + { "role": "user", "content": "What is the capital of France?" } + ] + }' +``` + +The request reaches OpenAI even though a cached entry exists, and the response is not written back to cache. 
+
+```text
+HTTP/1.1 200 OK
+Content-Type: application/json
+Server: APISIX/3.16.0
+X-AI-Cache-Status: BYPASS
+
+{
+  "id": "chatcmpl-Da7N4E9fA6KoQ7av98hL0zxplPCcD",
+  "object": "chat.completion",
+  "created": 1777500514,
+  "model": "gpt-4-0613",
+  "choices": [
+    {
+      "index": 0,
+      "message": {
+        "role": "assistant",
+        "content": "The capital of France is Paris.",
+        "refusal": null
+      },
+      "finish_reason": "stop"
+    }
+  ],
+  "usage": {
+    "prompt_tokens": 14,
+    "completion_tokens": 7,
+    "total_tokens": 21
+  },
+  "system_fingerprint": "fp_d3214ccada"
+}
+```
+
+## Caveats
+
+### The semantic-layer write is asynchronous
+
+After a `MISS`, the embedding fetch and the vector write to Redis happen in a background timer. If you send a paraphrased prompt immediately after the first request, you may see another `MISS` because the entry has not been stored yet. Wait a couple of seconds before sending a paraphrase when verifying a semantic hit.
+
+### Similarity is mathematical, not human-judged
+
+Two prompts that look semantically equivalent to a human can score below the configured `similarity_threshold` and therefore miss the cache. Conversely, a small wording change can flip the result. For example, with `similarity_threshold` set to `0.85` and the cache primed with `"What is the capital of France?"`:
+
+| Prompt | Status | Similarity |
+|--------|--------|------------|
+| `capital of France?` | `HIT-L2` | `0.850` |
+| `capital of France what?` | `MISS` | (below threshold) |
+| `capital of France what is?` | `HIT-L2` | `0.972` |
+| `capital of France what please?` | `HIT-L2` | `0.924` |
+| `capital of France what is please tell me?` | `MISS` | (below threshold) |
+
+Lowering the threshold catches more paraphrases, at the cost of occasionally serving a cached answer for a genuinely different question. Tune it empirically against your own traffic.
+
+### `BYPASS` does not refresh the cache
+
+A request with the bypass header reaches the upstream, but its response is not written back to the cache. Use it to force a fresh upstream call without invalidating or replacing the existing cached entry.
+
+The bypass header is not authenticated: any client that can set the configured header and value can bypass the cache. In production, gate access with an APISIX authentication plugin such as `key-auth`, restrict callers with `ip-restriction`, or strip the header at a WAF in front of the gateway.
diff --git a/t/admin/plugins.t b/t/admin/plugins.t
index adb98b28bc17..1454ec145eb0 100644
--- a/t/admin/plugins.t
+++ b/t/admin/plugins.t
@@ -98,6 +98,7 @@ ai-request-rewrite
 ai-prompt-guard
 ai-prompt-template
 ai-prompt-decorator
+ai-cache
 ai-rag
 ai-aws-content-moderation
 ai-proxy-multi
diff --git a/t/plugin/ai-cache-scope.t b/t/plugin/ai-cache-scope.t
new file mode 100644
index 000000000000..21facdcf4688
--- /dev/null
+++ b/t/plugin/ai-cache-scope.t
@@ -0,0 +1,384 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +BEGIN { + $ENV{TEST_ENABLE_CONTROL_API_V1} = "0"; +} + +use t::APISIX 'no_plan'; + +log_level("info"); +repeat_each(1); +no_long_string(); +no_shuffle(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + if (!$block->error_log && !$block->no_error_log) { + $block->set_value("no_error_log", "[error]\n[alert]"); + } + + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + listen 1990; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + local fixture_loader = require("lib.fixture_loader") + local content, err = fixture_loader.load("openai/embeddings-list.json") + if not content then + ngx.status = 500 + ngx.say(err) + return + end + + ngx.status = 200 + ngx.print(content) + } + } +} +_EOC_ + } +}); + +run_tests(); + +__DATA__ + +=== TEST 1: set up route with cache_key include_vars +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/scoped", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "cache_key": { + "include_vars": ["$http_x_tenant_id"] + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 2: tenant-a first request - MISS +--- request +POST /scoped +{"messages":[{"role":"user","content":"scope test prompt"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 3: tenant-b same prompt - MISS (proves cache_key partitioning) +--- request +POST /scoped +{"messages":[{"role":"user","content":"scope test prompt"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-b +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 4: tenant-a same prompt again - HIT-L1 +--- request +POST /scoped +{"messages":[{"role":"user","content":"scope test prompt"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 + + + +=== TEST 5: set up consumers for include_consumer test +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + local consumers = { + { username = "alice", key = "alice-key" }, + { username = "bob", key = "bob-key" }, + } + + for _, c in ipairs(consumers) do + local code, body = t('/apisix/admin/consumers', + ngx.HTTP_PUT, + string.format([[{ + "username": "%s", + "plugins": { "key-auth": { "key": "%s" } } + }]], c.username, c.key) + ) + if code >= 300 then + 
ngx.status = code + ngx.say(body) + return + end + end + ngx.say("passed") + } + } +--- response_body +passed + + + +=== TEST 6: set up route with cache_key include_consumer + key-auth +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/2', + ngx.HTTP_PUT, + [[{ + "uri": "/per-consumer", + "plugins": { + "key-auth": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "cache_key": { + "include_consumer": true + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 7: alice first request - MISS +--- request +POST /per-consumer +{"messages":[{"role":"user","content":"per-consumer prompt"}]} +--- more_headers +Content-Type: application/json +apikey: alice-key +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 8: bob same prompt - MISS (proves include_consumer partitioning) +--- request +POST /per-consumer +{"messages":[{"role":"user","content":"per-consumer prompt"}]} +--- more_headers +Content-Type: application/json +apikey: bob-key +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 9: bob same prompt again - HIT-L1 (proves bob has own cache) +--- request +POST /per-consumer +{"messages":[{"role":"user","content":"per-consumer prompt"}]} +--- more_headers +Content-Type: application/json +apikey: bob-key +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 + + + +=== TEST 10: set up route with L2 semantic + cache_key include_vars +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/3', + ngx.HTTP_PUT, + [[{ + "uri": "/scoped-semantic", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { "ttl": 60 }, + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings", + "api_key": "test-key" + } + }, + "cache_key": { + "include_vars": ["$http_x_tenant_id"] + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 11: tenant-a first request - MISS, writes to L2 with scope=hash(tenant-a) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 12: tenant-b same prompt - MISS (FT.SEARCH scope filter excludes tenant-a's entry) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-b +X-AI-Fixture: openai/chat-basic.json +--- 
error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 13: tenant-a paraphrase - HIT-L2 (scope filter finds tenant-a's entry) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-a +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 + + + +=== TEST 14: tenant-b paraphrase - HIT-L2 (proves tenant-b has own L2 entry) +--- request +POST /scoped-semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-Tenant-Id: tenant-b +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 diff --git a/t/plugin/ai-cache.t b/t/plugin/ai-cache.t new file mode 100644 index 000000000000..abc652328ac8 --- /dev/null +++ b/t/plugin/ai-cache.t @@ -0,0 +1,818 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +BEGIN { + $ENV{TEST_ENABLE_CONTROL_API_V1} = "0"; +} + +use t::APISIX 'no_plan'; + +log_level("info"); +repeat_each(1); +no_long_string(); +no_shuffle(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + if (!$block->error_log && !$block->no_error_log) { + $block->set_value("no_error_log", "[error]\n[alert]"); + } + + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + listen 1990; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + local fixture_loader = require("lib.fixture_loader") + local content, err = fixture_loader.load("openai/embeddings-list.json") + if not content then + ngx.status = 500 + ngx.say(err) + return + end + + ngx.status = 200 + ngx.print(content) + } + } +} +_EOC_ + } +}); + +run_tests(); + +__DATA__ + +=== TEST 1: valid config - exact layer only +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "exact" }, + exact = { ttl = 600 }, + redis_host = "127.0.0.1", + redis_port = 6379, + }) + + if not ok then + ngx.say("failed") + else + ngx.say("passed") + end + } + } +--- response_body +passed + + + +=== TEST 2: valid config - both layers with semantic embedding +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "exact", "semantic" }, + exact = { ttl = 3600 }, + semantic = { + similarity_threshold = 0.95, + ttl = 86400, + embedding = { + provider = "openai", + endpoint = "https://api.openai.com/v1/embeddings", + api_key = "sk-test", + }, + }, + redis_host = "127.0.0.1", + redis_port = 6379, + }) + + if not ok then + ngx.say(err) + else + ngx.say("passed") + end + } + } +--- response_body +passed + + + +=== TEST 3: semantic without embedding config - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "semantic" }, + redis_host = "127.0.0.1", + }) + + if not ok then + ngx.say("failed: ", err) + else + ngx.say("passed") + end + } + } +--- response_body +failed: semantic layer requires semantic.embedding to be configured + + + +=== TEST 4: invalid layer value - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "invalid_layer" }, + }) + + if not ok then + ngx.say(err) + else + ngx.say("passed") + end + } + } +--- response_body eval +qr/.*property "layers" validation failed:.*matches none of the enum values.*/ + + + +=== TEST 5: unsupported embedding provider - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = { "semantic" }, + semantic = { + embedding = { + provider = "some-unknown-provider", + endpoint = "https://example.com/embeddings", + api_key = "key", + }, + }, + }) + + if not ok then + ngx.say(err) + else + ngx.say("passed") + end + } + } +--- response_body eval +qr/.*property "provider" validation failed: matches none of the enum values.*/ + + + +=== TEST 6: similarity_threshold out of range - should fail +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local 
ok, err = plugin.check_schema({ + layers = { "semantic" }, + semantic = { + similarity_threshold = 1.5, + embedding = { + provider = "openai", + endpoint = "https://api.openai.com/v1/embeddings", + api_key = "sk-test", + }, + }, + }) + + if not ok then + ngx.say(err) + else + ngx.say("passed") + end + } + } +--- response_body eval +qr/.*property "similarity_threshold" validation failed: expected 1\.5 to be at most.*/ + + + +=== TEST 7: layers empty array - should fail (minItems=1) +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.ai-cache") + local ok, err = plugin.check_schema({ + layers = {}, + redis_host = "127.0.0.1", + }) + + if not ok then + ngx.say(err) + else + ngx.say("passed") + end + } + } +--- response_body eval +qr/.*property "layers" validation failed: expect array to have at least 1 items.*/ + + + +=== TEST 8: set up route for L1 cache tests +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/exact", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1", + "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}] + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 9: first request - cache MISS, upstream called +--- request +POST /exact +{"messages":[{"role":"user","content":"What is the answer to life?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ + + + +=== TEST 10: second identical request - cache HIT-L1, no upstream call +--- request +POST /exact +{"messages":[{"role":"user","content":"What is the answer to life?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- response_headers_like +X-AI-Cache-Age: \d+ +--- response_body_like eval +qr/"content":\s?"1 \+ 1 = 2\."/ +--- error_log +ai-cache: L1 hit for key + + + +=== TEST 11: bypass header - BYPASS, upstream called, not cached +--- request +POST /exact +{"messages":[{"role":"user","content":"What is the bypass question?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Cache-Bypass: 1 +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 12: same prompt without bypass after bypass - still MISS (bypass did not cache) +--- request +POST /exact +{"messages":[{"role":"user","content":"What is the bypass question?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 13: set up route with two bypass rules +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/exact", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { 
+ "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1", + "bypass_on": [ + {"header": "X-Cache-Bypass", "equals": "1"}, + {"header": "X-Debug", "equals": "true"} + ] + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 14: first bypass rule matches - BYPASS +--- request +POST /exact +{"messages":[{"role":"user","content":"multi-rule bypass test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Cache-Bypass: 1 +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 15: second bypass rule matches - BYPASS +--- request +POST /exact +{"messages":[{"role":"user","content":"multi-rule bypass test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Debug: true +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 16: set up route for upstream-status filter tests +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/2', + ngx.HTTP_PUT, + [[{ + "uri": "/error", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 17: non-2xx upstream response - not cached (status code filter) +--- request +POST /error +{"messages":[{"role":"user","content":"trigger a server error"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-AI-Fixture-Status: 500 +--- error_code: 500 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 18: same prompt after non-2xx - still MISS (was not cached) +--- request +POST /error +{"messages":[{"role":"user","content":"trigger a server error"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-AI-Fixture-Status: 500 +--- error_code: 500 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 19: set up route with very small max_cache_body_size +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/3', + ngx.HTTP_PUT, + [[{ + "uri": "/tiny", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "max_cache_body_size": 5, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 20: oversize response - MISS, log warns and skips cache write +--- request +POST /tiny +{"messages":[{"role":"user","content":"oversize body test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ +--- error_log +exceeds 
max_cache_body_size + + + +=== TEST 21: same prompt after oversize - still MISS (was not cached) +--- request +POST /tiny +{"messages":[{"role":"user","content":"oversize body test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ +--- error_log +exceeds max_cache_body_size + + + +=== TEST 22: set up route with custom cache header names +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/4', + ngx.HTTP_PUT, + [[{ + "uri": "/custom-headers", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "headers": { + "cache_status": "X-Custom-Status", + "cache_age": "X-Custom-Age" + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 23: MISS populates the cache and emits custom status header +--- request +POST /custom-headers +{"messages":[{"role":"user","content":"custom header test"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-Custom-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ +--- wait: 1 + + + +=== TEST 24: HIT emits custom status and age headers (defaults not used) +--- request +POST /custom-headers +{"messages":[{"role":"user","content":"custom header test"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-Custom-Status: HIT-L1 +X-AI-Cache-Status: +X-AI-Cache-Age: +--- response_headers_like +X-Custom-Age: \d+ +--- response_body_like eval +qr/"content":\s?"1 \+ 1 = 2\."/ + + + +=== TEST 25: clean up Redis cache state before semantic tests +--- config + location /t { + content_by_lua_block { + local redis = require("resty.redis") + local red = redis:new() + red:set_timeout(1000) + assert(red:connect("127.0.0.1", 6379)) + + red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD") + + local keys = red:keys("ai-cache:*") + if type(keys) == "table" and #keys > 0 then + red:del(unpack(keys)) + end + + red:close() + ngx.say("ok") + } + } +--- response_body +ok + + + +=== TEST 26: set up route for L2 semantic cache tests +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/5', + ngx.HTTP_PUT, + [[{ + "uri": "/semantic", + "plugins": { + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { + "ttl": 60 + }, + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings", + "api_key": "test-key" + } + }, + "redis_host": "127.0.0.1" + } + } + }]] + ) + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 27: L2 - first request, cache MISS, stored in L2 +--- request +POST 
/semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body_like eval +qr/\{ "content": "1 \+ 1 = 2\.", "role": "assistant" \}/ + + + +=== TEST 28: L2 - different wording hits L2 (same vector from fixture) +--- request +POST /semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 +--- response_headers_like +X-AI-Cache-Similarity: \d+(\.\d+)? +--- response_body_like eval +qr/"content":\s?"1 \+ 1 = 2\."/ +--- error_log +ai-cache: L2 hit + + + +=== TEST 29: L2 - paraphrase now hits L1 (backfilled by the previous L2 hit) +--- request +POST /semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- response_body_like eval +qr/"content":\s?"1 \+ 1 = 2\."/ +--- error_log +ai-cache: L1 hit for key + + + +=== TEST 30: streaming MISS - upstream called, response cached via log phase +--- request +POST /exact +{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-streaming.sse +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- response_body_like eval +qr/data:.*"content":"Hello"/ + + + +=== TEST 31: streaming HIT - Content-Type is text/event-stream, SSE body returned +--- request +POST /exact +{"messages":[{"role":"user","content":"Stream me something cool"}],"stream":true} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +Content-Type: text/event-stream +--- response_body_like eval +qr/data:.*"content":\s?"Hello!"/ +--- wait: 1 + + + +=== TEST 32: non-streaming HIT after streaming MISS - returns JSON +--- request +POST /exact +{"messages":[{"role":"user","content":"Stream me something cool"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +Content-Type: application/json +--- response_body_like eval +qr/"content":\s?"Hello!"/ diff --git a/t/plugin/prometheus-ai-cache.t b/t/plugin/prometheus-ai-cache.t new file mode 100644 index 000000000000..3af1a6ae2491 --- /dev/null +++ b/t/plugin/prometheus-ai-cache.t @@ -0,0 +1,481 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +BEGIN { + $ENV{TEST_ENABLE_CONTROL_API_V1} = "0"; + + if ($ENV{TEST_NGINX_CHECK_LEAK}) { + $SkipReason = "unavailable for the hup tests"; + } else { + $ENV{TEST_NGINX_USE_HUP} = 1; + undef $ENV{TEST_NGINX_USE_STAP}; + } +} + +use t::APISIX 'no_plan'; + +repeat_each(1); +no_long_string(); +no_shuffle(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!defined $block->request) { + $block->set_value("request", "GET /t"); + } + + my $user_yaml_config = <<_EOC_; +plugin_attr: + prometheus: + refresh_interval: 0.1 +plugins: + - ai-proxy + - ai-cache + - prometheus + - public-api + - key-auth +_EOC_ + $block->set_value("extra_yaml_config", $user_yaml_config); + + if (!defined $block->http_config) { + $block->set_value("http_config", <<_EOC_); +server { + listen 1990; + default_type 'application/json'; + + location /v1/embeddings { + content_by_lua_block { + local fixture_loader = require("lib.fixture_loader") + local content, err = fixture_loader.load("openai/embeddings-list.json") + if not content then + ngx.status = 500 + ngx.say(err) + return + end + + ngx.status = 200 + ngx.print(content) + } + } + + location /v1/embeddings-fail { + content_by_lua_block { + ngx.status = 500 + ngx.say('{"error":"simulated embedding failure"}') + } + } +} +_EOC_ + } +}); + +run_tests; + +__DATA__ + +=== TEST 1: set up routes +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + + local routes = { + { + url = "/apisix/admin/routes/1", + data = [[{ + "uri": "/exact", + "plugins": { + "prometheus": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + "redis_host": "127.0.0.1", + "bypass_on": [{"header": "X-Cache-Bypass", "equals": "1"}] + } + } + }]], + }, + { + url = "/apisix/admin/routes/2", + data = [[{ + "uri": "/semantic", + "plugins": { + "prometheus": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact", "semantic"], + "exact": { "ttl": 60 }, + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings", + "api_key": "test-key" + } + }, + "redis_host": "127.0.0.1" + } + } + }]], + }, + { + url = "/apisix/admin/routes/3", + data = [[{ + "uri": "/semantic-fail", + "plugins": { + "prometheus": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["semantic"], + "semantic": { + "similarity_threshold": 0.90, + "ttl": 300, + "embedding": { + "provider": "openai", + "endpoint": "http://127.0.0.1:1990/v1/embeddings-fail", + "api_key": "test-key" + } + }, + "redis_host": "127.0.0.1" + } + } + }]], + }, + { + url = "/apisix/admin/routes/4", + data = [[{ + "uri": "/exact-auth", + "plugins": { + "prometheus": {}, + "key-auth": {}, + "ai-proxy": { + "provider": "openai", + "auth": { + "header": { + "Authorization": "Bearer test-key" + } + }, + "override": { + "endpoint": "http://127.0.0.1:1980/v1/chat/completions" + } + }, + "ai-cache": { + "layers": ["exact"], + "exact": { "ttl": 60 }, + 
"redis_host": "127.0.0.1" + } + } + }]], + }, + { + url = "/apisix/admin/consumers", + data = [[{ + "username": "alice", + "plugins": { + "key-auth": { + "key": "alice-key" + } + } + }]], + }, + { + url = "/apisix/admin/routes/metrics", + data = [[{ + "plugins": { + "public-api": {} + }, + "uri": "/apisix/prometheus/metrics" + }]], + }, + } + + for _, route in ipairs(routes) do + local code, body = t(route.url, ngx.HTTP_PUT, route.data) + if code >= 300 then + ngx.status = code + end + ngx.say(body) + end + } + } +--- response_body eval +"passed\n" x 6 + + + +=== TEST 2: MISS request - upstream called +--- request +POST /exact +{"messages":[{"role":"user","content":"What is the meaning of life?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 3: same request - HIT-L1 +--- request +POST /exact +{"messages":[{"role":"user","content":"What is the meaning of life?"}]} +--- more_headers +Content-Type: application/json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- wait: 1 + + + +=== TEST 4: verify miss counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1/ + + + +=== TEST 5: verify hit counter with layer label +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1/ + + + +=== TEST 6: BYPASS request - upstream called, no cache interaction +--- request +POST /exact +{"messages":[{"role":"user","content":"What is the meaning of life?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +X-Cache-Bypass: 1 +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: BYPASS + + + +=== TEST 7: verify BYPASS did not increment misses counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="1",service_id="",consumer=""\} 1\n/ + + + +=== TEST 8: verify BYPASS did not increment hits counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="1",service_id="",consumer="",layer="l1"\} 1\n/ + + + +=== TEST 9: cleanup Redis cache state before semantic tests +--- config + location /t { + content_by_lua_block { + local redis = require("resty.redis") + local red = redis:new() + red:set_timeout(1000) + assert(red:connect("127.0.0.1", 6379)) + + red["FT.DROPINDEX"](red, "ai-cache-idx-3", "DD") + + local keys = red:keys("ai-cache:*") + if type(keys) == "table" and #keys > 0 then + red:del(unpack(keys)) + end + + red:close() + ngx.say("ok") + } + } +--- response_body +ok + + + +=== TEST 10: L2 first request - MISS, embedding API called +--- request +POST /semantic +{"messages":[{"role":"user","content":"What is the capital of France??"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 11: L2 second request - different wording, HIT-L2 +--- request +POST /semantic +{"messages":[{"role":"user","content":"Name the capital city of France"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L2 +--- wait: 1 + + + +=== TEST 12: verify miss counter for 
semantic route (route_id=2) +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="2",service_id="",consumer=""\} 1/ + + + +=== TEST 13: verify hits counter with layer="l2" +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="2",service_id="",consumer="",layer="l2"\} 1/ + + + +=== TEST 14: verify embedding latency histogram with provider label +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_embedding_latency_count\{route_id="2",service_id="",consumer="",provider="openai"\} 2/ + + + +=== TEST 15: embedding failure - request still returns 200 via fallback +--- request +POST /semantic-fail +{"messages":[{"role":"user","content":"What does this fail at?"}]} +--- more_headers +Content-Type: application/json +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS +--- wait: 1 + + + +=== TEST 16: verify embedding_failures counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_embedding_failures_total\{route_id="3",service_id="",consumer=""\} 1/ + + + +=== TEST 17: verify embedding-failure request also counted as miss +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="3",service_id="",consumer=""\} 1/ + + + +=== TEST 18: authenticated MISS request - consumer alice +--- request +POST /exact-auth +{"messages":[{"role":"user","content":"Authenticated cache test"}]} +--- more_headers +Content-Type: application/json +apikey: alice-key +X-AI-Fixture: openai/chat-basic.json +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: MISS + + + +=== TEST 19: authenticated HIT-L1 request - consumer alice +--- request +POST /exact-auth +{"messages":[{"role":"user","content":"Authenticated cache test"}]} +--- more_headers +Content-Type: application/json +apikey: alice-key +--- error_code: 200 +--- response_headers +X-AI-Cache-Status: HIT-L1 +--- wait: 1 + + + +=== TEST 20: verify consumer label is populated on hits counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_hits_total\{route_id="4",service_id="",consumer="alice",layer="l1"\} 1/ + + + +=== TEST 21: verify consumer label is populated on misses counter +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_ai_cache_misses_total\{route_id="4",service_id="",consumer="alice"\} 1/ + + + +=== TEST 22: verify cache hit is labelled as ai_chat (not traditional_http) +--- request +GET /apisix/prometheus/metrics +--- response_body_like eval +qr/apisix_http_status\{code="200",route="1"[^}]*request_type="ai_chat"[^}]*response_source="apisix"[^}]*\} 1/