diff --git a/quickwit/quickwit-query/src/aggregations.rs b/quickwit/quickwit-query/src/aggregations.rs index 6b7755c4fcc..42675bd8c98 100644 --- a/quickwit/quickwit-query/src/aggregations.rs +++ b/quickwit/quickwit-query/src/aggregations.rs @@ -12,8 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::net::Ipv6Addr; + use rustc_hash::FxHashMap; use serde::{Deserialize, Serialize}; +use tantivy::DateTime; use tantivy::aggregation::Key as TantivyKey; use tantivy::aggregation::agg_result::{ AggregationResult as TantivyAggregationResult, AggregationResults as TantivyAggregationResults, @@ -24,8 +27,10 @@ use tantivy::aggregation::agg_result::{ use tantivy::aggregation::metric::{ ExtendedStats, PercentileValues as TantivyPercentileValues, PercentileValuesVecEntry, PercentilesMetricResult as TantivyPercentilesMetricResult, SingleMetricResult, Stats, - TopHitsMetricResult, + TopHitsMetricResult as TantivyTopHitsMetricResult, TopHitsVecEntry as TantivyTopHitsVecEntry, }; +use tantivy::schema::{Facet, OwnedValue as TantivyOwnedValue}; +use tantivy::tokenizer::PreTokenizedString; // hopefully all From in this module are no-ops, otherwise, this is a very sad situation @@ -116,7 +121,7 @@ impl From for MetricResult { TantivyMetricResult::ExtendedStats(val) => MetricResult::ExtendedStats(val), TantivyMetricResult::Sum(val) => MetricResult::Sum(val), TantivyMetricResult::Percentiles(val) => MetricResult::Percentiles(val.into()), - TantivyMetricResult::TopHits(val) => MetricResult::TopHits(val), + TantivyMetricResult::TopHits(val) => MetricResult::TopHits(val.into()), TantivyMetricResult::Cardinality(val) => MetricResult::Cardinality(val), } } @@ -133,7 +138,7 @@ impl From for TantivyMetricResult { MetricResult::ExtendedStats(val) => TantivyMetricResult::ExtendedStats(val), MetricResult::Sum(val) => TantivyMetricResult::Sum(val), MetricResult::Percentiles(val) => TantivyMetricResult::Percentiles(val.into()), - MetricResult::TopHits(val) => TantivyMetricResult::TopHits(val), + MetricResult::TopHits(val) => TantivyMetricResult::TopHits(val.into()), MetricResult::Cardinality(val) => TantivyMetricResult::Cardinality(val), } } @@ -413,3 +418,149 @@ impl From for TantivyPercentilesMetricResult { TantivyPercentilesMetricResult { values } } } + +// Redefine the tantivy TopHitsVecEntry to use our own `OwnedValue` +// and avoid skip_serializing_if so postcard can (de)-serialize it. +/// The top_hits metric results entry +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct TopHitsVecEntry { + /// The sort values of the document, depending on the sort criteria in the request. + pub sort: Vec>, + + /// Search results, for queries that include field retrieval requests + /// (`docvalue_fields`). + #[serde(rename = "docvalue_fields")] + pub doc_value_fields: FxHashMap, +} + +/// The top_hits metric aggregation results a list of top hits by sort criteria. +/// +/// The main reason for wrapping it in `hits` is to match elasticsearch output structure. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct TopHitsMetricResult { + /// The result of the top_hits metric. + pub hits: Vec, +} + +/// Redefinition of [`TantivyOwnedValue`] to have it work +/// with postcard de-serialization. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum OwnedValue { + /// A null value. + Null, + /// The str type is used for any text information. + Str(String), + /// Pre-tokenized str type, + PreTokStr(PreTokenizedString), + /// Unsigned 64-bits Integer `u64` + U64(u64), + /// Signed 64-bits Integer `i64` + I64(i64), + /// 64-bits Float `f64` + F64(f64), + /// Bool value + Bool(bool), + /// Date/time with nanoseconds precision + Date(DateTime), + /// Facet + Facet(Facet), + /// Arbitrarily sized byte array + Bytes(Vec), + /// A set of values. + Array(Vec), + /// Dynamic object value. + Object(Vec<(String, Self)>), + /// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`. + IpAddr(Ipv6Addr), +} + +impl From for TantivyTopHitsVecEntry { + fn from(value: TopHitsVecEntry) -> TantivyTopHitsVecEntry { + TantivyTopHitsVecEntry { + sort: value.sort, + doc_value_fields: value + .doc_value_fields + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + } + } +} +impl From for TopHitsVecEntry { + fn from(value: TantivyTopHitsVecEntry) -> Self { + TopHitsVecEntry { + sort: value.sort, + doc_value_fields: value + .doc_value_fields + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + } + } +} +impl From for TantivyTopHitsMetricResult { + fn from(value: TopHitsMetricResult) -> Self { + TantivyTopHitsMetricResult { + hits: value + .hits + .into_iter() + .map(TantivyTopHitsVecEntry::from) + .collect(), + } + } +} +impl From for TopHitsMetricResult { + fn from(value: TantivyTopHitsMetricResult) -> Self { + TopHitsMetricResult { + hits: value.hits.into_iter().map(TopHitsVecEntry::from).collect(), + } + } +} + +impl From for OwnedValue { + fn from(value: TantivyOwnedValue) -> Self { + match value { + TantivyOwnedValue::Null => OwnedValue::Null, + TantivyOwnedValue::Str(v) => OwnedValue::Str(v), + TantivyOwnedValue::PreTokStr(v) => OwnedValue::PreTokStr(v), + TantivyOwnedValue::U64(v) => OwnedValue::U64(v), + TantivyOwnedValue::I64(v) => OwnedValue::I64(v), + TantivyOwnedValue::F64(v) => OwnedValue::F64(v), + TantivyOwnedValue::Bool(v) => OwnedValue::Bool(v), + TantivyOwnedValue::Date(v) => OwnedValue::Date(v), + TantivyOwnedValue::Facet(v) => OwnedValue::Facet(v), + TantivyOwnedValue::Bytes(v) => OwnedValue::Bytes(v), + TantivyOwnedValue::Array(v) => { + OwnedValue::Array(v.into_iter().map(OwnedValue::from).collect()) + } + TantivyOwnedValue::Object(v) => { + OwnedValue::Object(v.into_iter().map(|(k, v)| (k, v.into())).collect()) + } + TantivyOwnedValue::IpAddr(v) => OwnedValue::IpAddr(v), + } + } +} + +impl From for TantivyOwnedValue { + fn from(value: OwnedValue) -> Self { + match value { + OwnedValue::Null => TantivyOwnedValue::Null, + OwnedValue::Str(v) => TantivyOwnedValue::Str(v), + OwnedValue::PreTokStr(v) => TantivyOwnedValue::PreTokStr(v), + OwnedValue::F64(v) => TantivyOwnedValue::F64(v), + OwnedValue::U64(v) => TantivyOwnedValue::U64(v), + OwnedValue::I64(v) => TantivyOwnedValue::I64(v), + OwnedValue::Bool(v) => TantivyOwnedValue::Bool(v), + OwnedValue::Date(v) => TantivyOwnedValue::Date(v), + OwnedValue::Facet(v) => TantivyOwnedValue::Facet(v), + OwnedValue::Bytes(v) => TantivyOwnedValue::Bytes(v), + OwnedValue::Array(v) => { + TantivyOwnedValue::Array(v.into_iter().map(TantivyOwnedValue::from).collect()) + } + OwnedValue::Object(v) => { + TantivyOwnedValue::Object(v.into_iter().map(|(k, v)| (k, v.into())).collect()) + } + OwnedValue::IpAddr(v) => TantivyOwnedValue::IpAddr(v), + } + } +} diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0032-top_hits.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0032-top_hits.yaml new file mode 100644 index 00000000000..23a99b6b327 --- /dev/null +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0032-top_hits.yaml @@ -0,0 +1,139 @@ +# Test top_hits aggregation grouped by actor.login +params: + size: 0 +json: + aggs: + logins: + terms: + field: actor.login + order: + _key: asc + aggs: + recent_events: + top_hits: + size: 1 + sort: + - created_at: desc + docvalue_fields: + - created_at + - repo.name + +expected: + hits: + total: + value: 100 + hits: [] + + aggregations: + logins: + sum_other_doc_count: 90 + buckets: + - key: 1maria + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["Health-Tracker/health_tracker"] + created_at: ["2015-02-01T00:00:11Z"] + + - key: aborruso + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["SiciliaHub/albopretoriopa"] + created_at: ["2015-02-01T00:00:07Z"] + + - key: adius + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["casertap/UkuleleSongbook"] + created_at: ["2015-02-01T00:00:11Z"] + + - key: amosnier + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["amosnier/code"] + created_at: ["2015-02-01T00:00:06Z"] + + - key: anantax13 + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["anantax13/aboutme"] + created_at: ["2015-02-01T00:00:11Z"] + + - key: ancurio + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["Ancurio/mkxp"] + created_at: ["2015-02-01T00:00:15Z"] + + - key: athal7 + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["spree/spree"] + created_at: ["2015-02-01T00:00:15Z"] + + - key: basuco + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["carlops/Haskinator"] + created_at: ["2015-02-01T00:00:06Z"] + + - key: boecko + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["toy/blueutil"] + created_at: ["2015-02-01T00:00:12Z"] + + - key: bptripp + doc_count: 1 + recent_events: + hits: + hits: + - sort: + $expect: "len(val) == 1" + docvalue_fields: + repo.name: ["bptripp/nengo-FPGA"] + created_at: ["2015-02-01T00:00:11Z"]