diff --git a/quickwit/quickwit-proto/protos/quickwit/search.proto b/quickwit/quickwit-proto/protos/quickwit/search.proto index 04fa0cedf2e..8b0255ea027 100644 --- a/quickwit/quickwit-proto/protos/quickwit/search.proto +++ b/quickwit/quickwit-proto/protos/quickwit/search.proto @@ -125,6 +125,10 @@ message ListFieldsRequest { optional int64 start_timestamp = 3; optional int64 end_timestamp = 4; + // JSON-serialized QueryAst for index_filter support. + // When provided, only fields from documents matching this query are returned. + optional string query_ast = 5; + // Control if the request will fail if split_ids contains a split that does not exist. // optional bool fail_on_missing_index = 6; } @@ -141,7 +145,6 @@ message LeafListFieldsRequest { // Optional limit query to a list of fields // Wildcard expressions are supported. repeated string fields = 4; - } message ListFieldsResponse { diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs index e1201ce7a0e..6c0372e37f6 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs @@ -70,6 +70,10 @@ pub struct ListFieldsRequest { pub start_timestamp: ::core::option::Option, #[prost(int64, optional, tag = "4")] pub end_timestamp: ::core::option::Option, + /// JSON-serialized QueryAst for index_filter support. + /// When provided, only fields from documents matching this query are returned. + #[prost(string, optional, tag = "5")] + pub query_ast: ::core::option::Option<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/quickwit/quickwit-search/src/list_fields.rs b/quickwit/quickwit-search/src/list_fields.rs index f4cf173fe08..b5974867cfd 100644 --- a/quickwit/quickwit-search/src/list_fields.rs +++ b/quickwit/quickwit-search/src/list_fields.rs @@ -24,6 +24,8 @@ use itertools::Itertools; use quickwit_common::rate_limited_warn; use quickwit_common::shared_consts::{FIELD_PRESENCE_FIELD_NAME, SPLIT_FIELDS_FILE_NAME}; use quickwit_common::uri::Uri; +use quickwit_config::build_doc_mapper; +use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; use quickwit_metastore::SplitMetadata; use quickwit_proto::metastore::MetastoreServiceClient; use quickwit_proto::search::{ @@ -31,6 +33,7 @@ use quickwit_proto::search::{ ListFieldsResponse, SplitIdAndFooterOffsets, deserialize_split_fields, }; use quickwit_proto::types::{IndexId, IndexUid}; +use quickwit_query::query_ast::QueryAst; use quickwit_storage::Storage; use crate::leaf::open_split_bundle; @@ -310,6 +313,8 @@ impl FieldPattern { } /// `leaf` step of list fields. +/// +/// Returns field metadata from the assigned splits. pub async fn leaf_list_fields( index_id: IndexId, index_storage: Arc, @@ -322,6 +327,12 @@ pub async fn leaf_list_fields( .map(|pattern_str| FieldPattern::from_str(pattern_str)) .collect::>()?; + // If no splits, return empty response + if split_ids.is_empty() { + return Ok(ListFieldsResponse { fields: Vec::new() }); + } + + // Get fields from all splits let single_split_list_fields_futures: Vec<_> = split_ids .iter() .map(|split_id| { @@ -375,7 +386,7 @@ pub async fn leaf_list_fields( Ok(ListFieldsResponse { fields }) } -/// Index metas needed for executing a leaf search request. +/// Index metas needed for executing a leaf list fields request. #[derive(Clone, Debug)] pub struct IndexMetasForLeafSearch { /// Index id. @@ -399,29 +410,63 @@ pub async fn root_list_fields( if indexes_metadata.is_empty() { return Ok(ListFieldsResponse { fields: Vec::new() }); } - let index_uid_to_index_meta: HashMap = indexes_metadata - .iter() - .map(|index_metadata| { - let index_metadata_for_leaf_search = IndexMetasForLeafSearch { - index_uri: index_metadata.index_uri().clone(), - index_id: index_metadata.index_config.index_id.to_string(), - }; - - ( - index_metadata.index_uid.clone(), - index_metadata_for_leaf_search, + + // Build index metadata map and extract timestamp field for time range refinement + let mut index_uid_to_index_meta: HashMap = HashMap::new(); + let mut index_uids: Vec = Vec::new(); + let mut timestamp_field_opt: Option = None; + + for index_metadata in indexes_metadata { + // Extract timestamp field for time range refinement (use first index's field) + if timestamp_field_opt.is_none() + && list_fields_req.query_ast.is_some() + && let Ok(doc_mapper) = build_doc_mapper( + &index_metadata.index_config.doc_mapping, + &index_metadata.index_config.search_settings, ) - }) - .collect(); - let index_uids: Vec = indexes_metadata - .into_iter() - .map(|index_metadata| index_metadata.index_uid) - .collect(); + { + timestamp_field_opt = doc_mapper.timestamp_field_name().map(|s| s.to_string()); + } + + let index_metadata_for_leaf_search = IndexMetasForLeafSearch { + index_uri: index_metadata.index_uri().clone(), + index_id: index_metadata.index_config.index_id.to_string(), + }; + + index_uids.push(index_metadata.index_uid.clone()); + index_uid_to_index_meta.insert( + index_metadata.index_uid.clone(), + index_metadata_for_leaf_search, + ); + } + + // Extract tags and refine time range from query_ast for split pruning + let mut start_timestamp = list_fields_req.start_timestamp; + let mut end_timestamp = list_fields_req.end_timestamp; + let tags_filter_opt = if let Some(ref query_ast_json) = list_fields_req.query_ast { + let query_ast: QueryAst = serde_json::from_str(query_ast_json) + .map_err(|err| SearchError::InvalidQuery(err.to_string()))?; + + // Refine time range from query AST if timestamp field is available + if let Some(ref timestamp_field) = timestamp_field_opt { + crate::root::refine_start_end_timestamp_from_ast( + &query_ast, + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ); + } + + extract_tags_from_query(query_ast) + } else { + None + }; + let split_metadatas: Vec = list_relevant_splits( index_uids, - list_fields_req.start_timestamp, - list_fields_req.end_timestamp, - None, + start_timestamp, + end_timestamp, + tags_filter_opt, &mut metastore, ) .await?; diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs index a382c541dc7..9aefdc83762 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs @@ -15,7 +15,10 @@ use std::collections::HashMap; use quickwit_proto::search::{ListFieldType, ListFieldsEntryResponse, ListFieldsResponse}; +use quickwit_query::ElasticQueryDsl; +use quickwit_query::query_ast::QueryAst; use serde::{Deserialize, Serialize}; +use warp::hyper::StatusCode; use super::ElasticsearchError; use super::search_query_params::*; @@ -173,16 +176,227 @@ pub fn convert_to_es_field_capabilities_response( FieldCapabilityResponse { indices, fields } } +/// Parses an Elasticsearch index_filter JSON value into a Quickwit QueryAst. +/// +/// Returns `Ok(None)` if the index_filter is null. +/// Returns `Ok(Some(QueryAst))` if the index_filter is valid. +/// Returns `Err` if the index_filter is invalid or cannot be converted (including empty object). +#[allow(clippy::result_large_err)] +pub fn parse_index_filter_to_query_ast( + index_filter: serde_json::Value, +) -> Result, ElasticsearchError> { + if index_filter.is_null() { + return Ok(None); + } + + // Parse ES Query DSL to internal QueryAst + let elastic_query_dsl: ElasticQueryDsl = + serde_json::from_value(index_filter).map_err(|err| { + ElasticsearchError::new( + StatusCode::BAD_REQUEST, + format!("Invalid index_filter: {err}"), + None, + ) + })?; + + let query_ast: QueryAst = elastic_query_dsl.try_into().map_err(|err: anyhow::Error| { + ElasticsearchError::new( + StatusCode::BAD_REQUEST, + format!("Failed to convert index_filter: {err}"), + None, + ) + })?; + + Ok(Some(query_ast)) +} + #[allow(clippy::result_large_err)] pub fn build_list_field_request_for_es_api( index_id_patterns: Vec, search_params: FieldCapabilityQueryParams, - _search_body: FieldCapabilityRequestBody, + search_body: FieldCapabilityRequestBody, ) -> Result { + let query_ast = parse_index_filter_to_query_ast(search_body.index_filter)?; + let query_ast_json = query_ast + .map(|ast| serde_json::to_string(&ast).expect("QueryAst should be JSON serializable")); + Ok(quickwit_proto::search::ListFieldsRequest { index_id_patterns, fields: search_params.fields.unwrap_or_default(), start_timestamp: search_params.start_timestamp, end_timestamp: search_params.end_timestamp, + query_ast: query_ast_json, }) } + +#[cfg(test)] +mod tests { + use serde_json::json; + + use super::*; + + #[test] + fn test_build_list_field_request_empty_index_filter() { + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + FieldCapabilityRequestBody::default(), + ) + .unwrap(); + + assert_eq!(result.index_id_patterns, vec!["test_index".to_string()]); + assert!(result.query_ast.is_none()); + } + + #[test] + fn test_build_list_field_request_with_term_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ + "term": { + "status": "active" + } + }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ) + .unwrap(); + + assert_eq!(result.index_id_patterns, vec!["test_index".to_string()]); + assert!(result.query_ast.is_some()); + + // Verify the query_ast is valid JSON + let query_ast: serde_json::Value = + serde_json::from_str(&result.query_ast.unwrap()).unwrap(); + assert!(query_ast.is_object()); + } + + #[test] + fn test_build_list_field_request_with_bool_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ + "bool": { + "must": [ + { "term": { "status": "active" } } + ], + "filter": [ + { "range": { "age": { "gte": 18 } } } + ] + } + }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ) + .unwrap(); + + assert!(result.query_ast.is_some()); + } + + #[test] + fn test_build_list_field_request_with_invalid_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ + "invalid_query_type": { + "field": "value" + } + }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ); + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.status, StatusCode::BAD_REQUEST); + } + + #[test] + fn test_build_list_field_request_with_null_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: serde_json::Value::Null, + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ) + .unwrap(); + + assert!(result.query_ast.is_none()); + } + + #[test] + fn test_build_list_field_request_preserves_other_params() { + let search_params = FieldCapabilityQueryParams { + fields: Some(vec!["field1".to_string(), "field2".to_string()]), + start_timestamp: Some(1000), + end_timestamp: Some(2000), + ..Default::default() + }; + + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ "match_all": {} }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + search_params, + search_body, + ) + .unwrap(); + + assert_eq!( + result.fields, + vec!["field1".to_string(), "field2".to_string()] + ); + assert_eq!(result.start_timestamp, Some(1000)); + assert_eq!(result.end_timestamp, Some(2000)); + assert!(result.query_ast.is_some()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_null() { + let result = parse_index_filter_to_query_ast(serde_json::Value::Null).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_empty_object() { + // Empty object {} should return error to match ES behavior + let result = parse_index_filter_to_query_ast(json!({})); + assert!(result.is_err()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_valid_term() { + let result = parse_index_filter_to_query_ast(json!({ + "term": { "status": "active" } + })) + .unwrap(); + assert!(result.is_some()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_invalid() { + let result = parse_index_filter_to_query_ast(json!({ + "invalid_query_type": { "field": "value" } + })); + assert!(result.is_err()); + } +} diff --git a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml index bd3cd917acd..79accac5be3 100644 --- a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml +++ b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml @@ -334,3 +334,140 @@ engines: - elasticsearch endpoint: doesno*texist/_field_caps?fields=date status_code: 200 +--- +# Test _field_caps API with index_filter (term query) +# Note: term queries require exact token match; 'fritz' is lowercase due to default tokenizer +method: [POST] +engines: + - quickwit + - elasticsearch +endpoint: fieldcaps/_field_caps?fields=* +json: + index_filter: + term: + name: "fritz" +expected: + indices: + - fieldcaps + fields: + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true +--- +# Test _field_caps API with index_filter (match_all query) +method: [POST] +engines: + - quickwit + - elasticsearch +endpoint: fieldcaps/_field_caps?fields=name +json: + index_filter: + match_all: {} +expected: + indices: + - fieldcaps + fields: + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true +--- +# Test _field_caps API with index_filter (bool query) +method: [POST] +engines: + - quickwit + - elasticsearch +endpoint: fieldcaps/_field_caps?fields=response,name +json: + index_filter: + bool: + must: + - term: + name: "fritz" + filter: + - range: + response: + gte: 30 +expected: + indices: + - fieldcaps + fields: + response: + long: + type: long + metadata_field: false + searchable: true + aggregatable: true + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true +--- +# Test _field_caps API with invalid index_filter +method: [POST] +engines: + - quickwit + - elasticsearch +endpoint: fieldcaps/_field_caps?fields=* +json: + index_filter: + invalid_query_type: + field: "value" +status_code: 400 +--- +# Test _field_caps API with empty index_filter (should return 400 like ES) +method: [POST] +engines: + - quickwit + - elasticsearch +endpoint: fieldcaps/_field_caps?fields=name +json: + index_filter: {} +status_code: 400 +--- +# Test _field_caps API with index_filter using tag field for split pruning (QW-only) +method: [POST] +engines: + - quickwit +endpoint: fieldcaps/_field_caps?fields=name +json: + index_filter: + term: + tags: "nice" +expected: + indices: + - fieldcaps + fields: + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true diff --git a/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml index 8b02ee01882..5576e6cec28 100644 --- a/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml @@ -22,6 +22,7 @@ json: tokenizer: default fast: true timestamp_field: date + tag_fields: ["tags"] field_mappings: - name: date type: datetime @@ -32,6 +33,10 @@ json: - name: host type: ip fast: true + - name: tags + type: array + tokenizer: raw + fast: true --- # Create index method: POST