From 3d8cddd501ce4ffe0c404de698335b8262e88c27 Mon Sep 17 00:00:00 2001 From: Jose Castro Date: Wed, 4 Feb 2026 22:59:43 -0600 Subject: [PATCH] task(content analytics) #34395 : Implement the Engagement SQL structures --- .../analytics/setup/config/dev/cube/cube.js | 12 +- .../config/dev/cube/schema/EngagementDaily.js | 191 +++ .../config/dev/cube/schema/EventSummary.js | 8 +- .../dev/cube/schema/SessionsByBrowserDaily.js | 110 ++ .../dev/cube/schema/SessionsByDeviceDaily.js | 114 ++ .../cube/schema/SessionsByLanguageDaily.js | 111 ++ .../setup/db/clickhouse/init-scripts/init.sql | 1144 ++++++++++++++++- 7 files changed, 1679 insertions(+), 11 deletions(-) create mode 100644 docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js create mode 100644 docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js create mode 100644 docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js create mode 100644 docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js index d68b5a3b52b8..1c02b39da2a1 100644 --- a/docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js @@ -37,7 +37,17 @@ const fs = require('fs'); const path = require('path'); // Whitelist of allowed cubes for security validation -const ALLOWED_CUBES = ['Events', 'request', 'ContentAttribution', 'Conversion', 'EventSummary']; +const ALLOWED_CUBES = [ + 'Events', + 'request', + 'ContentAttribution', + 'Conversion', + 'EventSummary', + 'EngagementDaily', + 'SessionsByDeviceDaily', + 'SessionsByBrowserDaily', + 'SessionsByLanguageDaily' +]; // Extract customer ID from scope (required for all queries) // Note: Scope comes from 
validated JWT token, so customer_id is already trusted diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js new file mode 100644 index 000000000000..46f9779fa048 --- /dev/null +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js @@ -0,0 +1,191 @@ +/** + * ===================================================================================== + * 1) EngagementDaily Cube + * ===================================================================================== + * + * Source table: + * clickhouse_test_db.engagement_daily + * + * Grain: + * One row per (customer_id, cluster_id, context_site_id, day). + * + * Primary use cases: + * - KPI cards: + * - Engagement Rate + * - Avg Interactions (Engaged) + * - Avg Session Time + * - Total Sessions + * - Conversion Rate (Engaged conversions / Total sessions) + * - Trend charts: + * - Engaged sessions per day + * - Total sessions per day + * + * Why this cube is fast: + * - The table is already rolled up daily in ClickHouse. + * - Queries over date ranges become sums over days, not scans of sessions/events. + * + * Correctness notes: + * - Rates must be computed as ratio-of-sums over the selected period. + * Example: + * engagementRate = sum(engaged_sessions) / sum(total_sessions) + * Do NOT average daily engagement rates. 
+ */ +cube(`EngagementDaily`, { + sql: ` + SELECT + customer_id, cluster_id, + context_site_id, + day, + total_sessions, + engaged_sessions, + engaged_conversion_sessions, + total_events_all, + total_duration_all, + total_events_engaged, + total_duration_engaged, + updated_at + FROM clickhouse_test_db.engagement_daily + `, + + measures: { + // --------------------------- + // Base counts (raw sums) + // + // Base totals (these are additive; safe to SUM over any date range) + // --------------------------- + totalSessions: { + sql: `total_sessions`, + type: `sum`, + title: `Total Sessions`, + description: `Total sessions in the selected period (sum of daily totals).` + }, + + engagedSessions: { + sql: `engaged_sessions`, + type: `sum`, + title: `Engaged Sessions`, + description: `Number of engaged sessions in the selected period.` + }, + + engagedConversionSessions: { + sql: `engaged_conversion_sessions`, + type: `sum`, + title: `Engaged Sessions With Conversion`, + description: `Engaged sessions that contain at least one conversion event.` + }, + + // --------------------------- + // Base sums for averages + // --------------------------- + totalEventsAll: { + sql: `total_events_all`, + type: `sum`, + title: `Total Events (All Sessions)`, + description: `Sum of total events across ALL sessions (engaged + non-engaged).` + }, + + totalDurationAllSeconds: { + sql: `total_duration_all`, + type: `sum`, + title: `Total Session Duration (Seconds, All Sessions)`, + description: `Sum of duration_seconds across ALL sessions.` + }, + + totalEventsEngaged: { + sql: `total_events_engaged`, + type: `sum`, + title: `Total Events (Engaged Sessions)`, + description: `Sum of total events across ENGAGED sessions only.` + }, + + totalDurationEngagedSeconds: { + sql: `total_duration_engaged`, + type: `sum`, + title: `Total Session Duration (Seconds, Engaged Sessions)`, + description: `Sum of duration_seconds across ENGAGED sessions only.` + }, + + // --------------------------- + // KPI Ratios
(ratio of sums) + // --------------------------- + // === Derived KPI measures (computed as ratio of sums; correct for any time range) === + engagementRate: { + // engaged sessions / total sessions + sql: `sum(engaged_sessions) / nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Engagement Rate`, + description: `Engaged Sessions / Total Sessions (ratio of sums over the selected period).`, + }, + + conversionRate: { + // engaged sessions with >=1 conversion / total sessions (your definition) + sql: `sum(engaged_conversion_sessions) / nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Conversion Rate`, + description: `Engaged sessions with >=1 conversion / Total sessions.` + }, + + avgInteractionsPerEngagedSession: { + // average events per engaged session + sql: `sum(total_events_engaged) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Interactions per Engaged Session`, + description: `Total events in engaged sessions / engaged sessions.` + }, + + avgSessionTimeSeconds: { + // average session duration (seconds) over all sessions + sql: `sum(total_duration_all) / nullIf(sum(total_sessions), 0)`, + type: `number`, + title: `Avg Session Time (Seconds)`, + description: `Average time per session over ALL sessions: total_duration_all / total_sessions.` + }, + + avgEngagedSessionTimeSeconds: { + // average duration among engaged sessions only (often useful) + sql: `sum(total_duration_engaged) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Engaged Session Time (Seconds)`, + description: `Average time per ENGAGED session: total_duration_engaged / engaged_sessions.` + }, + }, + + dimensions: { + customerId: { + sql: `customer_id`, + type: `string`, + title: `Customer Id`, + description: `Tenant identifier.
Always filter by this in production.` + }, + + clusterId: { + sql: `cluster_id`, + type: `string`, + title: `Cluster Id`, + description: `Environment/cluster identifier (prod/stage/etc.). Filter when needed.`, + }, + + contextSiteId: { + sql: `context_site_id`, + type: `string`, + title: `Site Id`, + description: `dotCMS Site identifier (context_site_id)` + }, + + day: { + sql: `day`, + type: `time`, + title: `Day`, + description: `Day grain used for filtering and trends. Use granularity: day.` + }, + + updatedAt: { + sql: `updated_at`, + type: `time`, + title: `Updated At`, + description: `When this rollup row was last recomputed by the refreshable MV.` + }, + }, +}); diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js index 9354d67f1ac5..83654b59b402 100644 --- a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js @@ -6,10 +6,10 @@ cube(`EventSummary`, { sql: ` SELECT day, cluster_id, customer_id, context_site_id, event_type, context_user_id, identifier, title, daily_total FROM content_events_counter - WHERE ${FILTER_PARAMS.ContentAttribution.customerId.filter('customer_id')} AND ( - ${FILTER_PARAMS.ContentAttribution.clusterId ? FILTER_PARAMS.ContentAttribution.clusterId.filter('cluster_id') : '1=1'} - OR (${FILTER_PARAMS.ContentAttribution.clusterId ? 'FALSE' : '(cluster_id IS NULL OR cluster_id = \'\')'})) AND - ${FILTER_PARAMS.ContentAttribution.day.filter('day')} + WHERE ${FILTER_PARAMS.EventSummary.customerId.filter('customer_id')} AND ( + ${FILTER_PARAMS.EventSummary.clusterId ? FILTER_PARAMS.EventSummary.clusterId.filter('cluster_id') : '1=1'} + OR (${FILTER_PARAMS.EventSummary.clusterId ? 
'FALSE' : '(cluster_id IS NULL OR cluster_id = \'\')'})) AND + ${FILTER_PARAMS.EventSummary.day.filter('day')} `, // 2) Measures diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js new file mode 100644 index 000000000000..e99e65b37f2c --- /dev/null +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js @@ -0,0 +1,110 @@ +/** + * ===================================================================================== + * 3) SessionsByBrowserDaily Cube + * ===================================================================================== + * + * Source table: + * clickhouse_test_db.sessions_by_browser_daily + * + * Grain: + * One row per (customer_id, cluster_id, context_site_id, day, browser_family). + * + * Widget semantics: + * - Engaged sessions count per browser + * - Engaged% within browser = engaged_sessions / total_sessions + * - Avg engaged time per browser = total_duration_engaged_seconds / engaged_sessions + */ +cube(`SessionsByBrowserDaily`, { + sql: ` + SELECT + customer_id, cluster_id, + context_site_id, + day, + browser_family, + total_sessions, + engaged_sessions, + total_duration_engaged_seconds, + updated_at + FROM clickhouse_test_db.sessions_by_browser_daily + `, + + measures: { + totalSessions: { + sql: `total_sessions`, + type: `sum`, + title: `Total Sessions (All)`, + description: `All sessions spawned by this browser family in the selected period.`, + }, + + engagedSessions: { + sql: `engaged_sessions`, + type: `sum`, + title: `Engaged Sessions`, + description: `Engaged sessions spawned by this browser family in the selected period.`, + }, + + totalDurationEngagedSeconds: { + sql: `total_duration_engaged_seconds`, + type: `sum`, + title: `Total Duration (Engaged, Seconds)`, + }, + + engagedRateWithinBrowser: { + sql: `sum(engaged_sessions) /
nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Engaged % (Within Browser)`, + description: `Engaged sessions / total sessions within the browser bucket.`, + }, + + avgEngagedSessionTimeSeconds: { + sql: `sum(total_duration_engaged_seconds) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Engaged Session Time (Seconds)` + }, + }, + + dimensions: { + customerId: { + sql: `customer_id`, + type: `string`, + title: `Customer Id`, + description: `Tenant identifier. Always filter by this in production.` + }, + + clusterId: { + sql: `cluster_id`, + type: `string`, + title: `Cluster Id`, + description: `Environment/cluster identifier (prod/stage/etc.). Filter when needed.`, + }, + + contextSiteId: { + sql: `context_site_id`, + type: `string`, + title: `Site Id`, + description: `dotCMS Site identifier (context_site_id)` + }, + + day: { + sql: `day`, + type: `time`, + title: `Day`, + description: `Day grain used for filtering and trends. Use granularity: day.` + }, + + browserFamily: { + sql: `browser_family`, + type: `string`, + title: `Browser Family`, + description: `Normalized bucket: Chrome / Safari / Firefox / Edge / Other.`, + }, + + updatedAt: { + sql: `updated_at`, + type: `time`, + title: `Updated At`, + description: `When this rollup row was last recomputed by the refreshable MV.` + }, + }, +}); diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js new file mode 100644 index 000000000000..5162ff2c9cab --- /dev/null +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js @@ -0,0 +1,114 @@ +/** + * ===================================================================================== + * 2) SessionsByDeviceDaily Cube + * ===================================================================================== + * + * Source 
table: + * clickhouse_test_db.sessions_by_device_daily + * + * Grain: + * One row per (customer_id, cluster_id, context_site_id, day, device_category). + * + * Widget semantics (matches your screenshots): + * - The big number (e.g., 32030) is ENGAGED sessions in that bucket. + * - The percentage (e.g., 72%) is: + * engaged_sessions_in_bucket / total_sessions_in_bucket + * - Average time is computed ONLY over engaged sessions: + * total_duration_engaged_seconds / engaged_sessions + */ +cube(`SessionsByDeviceDaily`, { + sql: ` + SELECT + customer_id, cluster_id, + context_site_id, + day, + device_category, + total_sessions, + engaged_sessions, + total_duration_engaged_seconds, + updated_at + FROM clickhouse_test_db.sessions_by_device_daily + `, + + measures: { + // Base measures (sums) + totalSessions: { + sql: `total_sessions`, + type: `sum`, + title: `Total Sessions (All)`, + description: `All sessions spawned by this device category in the selected period.` + }, + engagedSessions: { + sql: `engaged_sessions`, + type: `sum`, + title: `Engaged Sessions`, + description: `Engaged sessions spawned by this device category in the selected period.` + }, + + engagedRateWithinDevice: { + // engaged / total within the device bucket (this is your 72%) + sql: `sum(engaged_sessions) / nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Engaged % (Within Device)`, + description: `Engaged sessions / total sessions within the device category.` + }, + + avgEngagedSessionTimeSeconds: { + // average duration over engaged sessions only + sql: `sum(total_duration_engaged_seconds) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Engaged Session Time (Seconds)`, + description: `Average engaged session time within the device category.` + }, + + totalDurationEngagedSeconds: { + sql: `total_duration_engaged_seconds`, + type: `sum`, + title: `Total Duration Engaged (Seconds)`, + description: `Sum of engaged session durations for this device category.`
}, + }, + + dimensions: { + customerId: { + sql: `customer_id`, + type: `string`, + title: `Customer Id`, + description: `Tenant identifier. Always filter by this in production.` + }, + + clusterId: { + sql: `cluster_id`, + type: `string`, + title: `Cluster Id`, + description: `Environment/cluster identifier (prod/stage/etc.). Filter when needed.`, + }, + + contextSiteId: { + sql: `context_site_id`, + type: `string`, + title: `Site Id`, + description: `dotCMS Site identifier (context_site_id)` + }, + + day: { + sql: `day`, + type: `time`, + title: `Day`, + description: `Day grain used for filtering and trends. Use granularity: day.` + }, + + deviceCategory: { + sql: `device_category`, + type: `string`, + description: `Normalized bucket: Desktop / Mobile / Tablet / Other.` + }, + updatedAt: { + sql: `updated_at`, + type: `time`, + title: `Updated At`, + description: `When this rollup row was last recomputed by the refreshable MV.` + }, + }, +}); diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js new file mode 100644 index 000000000000..9924660d872f --- /dev/null +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js @@ -0,0 +1,111 @@ +/** + * ===================================================================================== + * 4) SessionsByLanguageDaily Cube + * ===================================================================================== + * + * Source table: + * clickhouse_test_db.sessions_by_language_daily + * + * Grain: + * One row per (customer_id, cluster_id, context_site_id, day, language_id). 
+ * + * Widget semantics: + * - Engaged sessions count per language + * - Engaged% within language = engaged_sessions / total_sessions + * - Avg engaged time per language = total_duration_engaged_seconds / engaged_sessions + * + * Note: + * - language_id is stored as String. + * - UI resolves the language name via REST Endpoint/API + */ + cube(`SessionsByLanguageDaily`, { + sql: ` + SELECT + customer_id, cluster_id, + context_site_id, + day, + language_id, + total_sessions, + engaged_sessions, + total_duration_engaged_seconds, + updated_at + FROM clickhouse_test_db.sessions_by_language_daily + `, + + measures: { + totalSessions: { + sql: `total_sessions`, + type: `sum`, + title: `Total Sessions (All)`, + }, + + engagedSessions: { + sql: `engaged_sessions`, + type: `sum`, + title: `Engaged Sessions`, + }, + + totalDurationEngagedSeconds: { + sql: `total_duration_engaged_seconds`, + type: `sum`, + title: `Total Duration (Engaged, Seconds)`, + }, + + engagedRateWithinLanguage: { + sql: `sum(engaged_sessions) / nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Engaged % (Within Language)`, + }, + + avgEngagedSessionTimeSeconds: { + sql: `sum(total_duration_engaged_seconds) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Engaged Session Time (Seconds)`, + } + }, + + dimensions: { + customerId: { + sql: `customer_id`, + type: `string`, + title: `Customer Id`, + description: `Tenant identifier. Always filter by this in production.` + }, + + clusterId: { + sql: `cluster_id`, + type: `string`, + title: `Cluster Id`, + description: `Environment/cluster identifier (prod/stage/etc.). Filter when needed.`, + }, + + contextSiteId: { + sql: `context_site_id`, + type: `string`, + title: `Site Id`, + description: `dotCMS Site identifier (context_site_id)` + }, + + day: { + sql: `day`, + type: `time`, + title: `Day`, + description: `Day grain used for filtering and trends.
Use granularity: day.` + }, + + languageId: { + sql: `language_id`, + type: `string`, + title: `Language Id`, + description: `dotCMS language id (String). Display name resolved externally.`, + }, + + updatedAt: { + sql: `updated_at`, + type: `time`, + title: `Updated At`, + description: `When this rollup row was last recomputed by the refreshable MV.` + }, + }, +}); diff --git a/docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql b/docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql index 43b5e82be238..c2fcdea8bd4a 100644 --- a/docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql +++ b/docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql @@ -34,7 +34,7 @@ -- This is the raw event ingestion table. All data from Jitsu/collectors enters here first. -- ===================================================================== CREATE DATABASE IF NOT EXISTS clickhouse_test_db; -use clickhouse_test_db; +USE clickhouse_test_db; CREATE TABLE IF NOT EXISTS clickhouse_test_db.events ( -- ###################################################### @@ -237,7 +237,7 @@ CREATE TABLE clickhouse_test_db.content_events_counter daily_total UInt64 ) ENGINE = SummingMergeTree(daily_total) - ORDER BY (customer_id, cluster_id, context_user_id, day, identifier, title, event_type); + ORDER BY (customer_id, cluster_id, context_user_id, day, identifier, title, event_type); -- ===================================================================== @@ -303,8 +303,8 @@ CREATE TABLE clickhouse_test_db.conversion_time timestamp_last_time AggregateFunction( max, DateTime64(3, 'UTC')) ) ENGINE = AggregatingMergeTree - PARTITION BY (customer_id) - ORDER BY (customer_id, cluster_id, context_user_id); + PARTITION BY (customer_id) + ORDER BY (customer_id, cluster_id, context_user_id); -- ===================================================================== -- Tracks which content a user 
interacted with prior to a conversion and after the user's previous conversion @@ -332,8 +332,8 @@ CREATE TABLE clickhouse_test_db.content_presents_in_conversion events_count UInt32 ) ENGINE = SummingMergeTree - PARTITION BY (customer_id) - ORDER BY (customer_id, cluster_id, context_user_id, event_type, conversion_name, identifier, title, day); + PARTITION BY (customer_id) + ORDER BY (customer_id, cluster_id, context_user_id, event_type, conversion_name, identifier, title, day); -- ===================================================================== @@ -443,3 +443,1135 @@ SELECT customer_id, FROM clickhouse_test_db.content_presents_in_conversion GROUP BY customer_id, cluster_id, context_user_id, context_site_id; + +/* ===================================================================================================== + Session Engagement Pipeline + ===================================================================================================== + + This script defines the complete analytics pipeline used to compute GA4-style engagement metrics + while remaining dotCMS-specific, scalable, and explainable. The pipeline is layered: + + events (raw immutable events) + ↓ (real-time materialized view) + session_states (incremental, mergeable session aggregates) + ↓ (refreshable MV - finalize only recent sessions) + session_facts (one row per session, dashboard-friendly) + ↓ (refreshable MVs - daily rollups) + engagement_daily + sessions_by_*_daily (small rollups for dashboards) + ↓ + CubeJS reads from rollup tables and exposes metrics via API + + ┌───────────────────────────────────────────────┐ + │ Browser / Site │ + │ │ + │ pageview | content_click | conversion | ... 
│ + │ │ + └──────────────────────┬────────────────────────┘ + │ + ▼ + ┌───────────────────────────────────────────────┐ + │ events (MergeTree) │ + │ │ + │ - one row per event │ + │ - sessionid │ + │ - user agent (device, browser) │ + │ - language id │ + │ - timestamp │ + │ │ + └──────────────────────┬────────────────────────┘ + │ (incremental MV) + ▼ + ┌───────────────────────────────────────────────┐ + │ session_states (AggregatingMT) │ + │ │ + │ - one row per session (event states) │ + │ - min/max timestamp │ + │ - pageview count │ + │ - conversion count │ + │ - last-seen device / browser / language │ + │ │ + └──────────────────────┬────────────────────────┘ + │ (refreshable MV) + ▼ + ┌───────────────────────────────────────────────┐ + │ session_facts (ReplacingMT) │ + │ │ + │ - one row per session │ + │ - duration_seconds │ + │ - total events │ + │ - engaged (true/false) │ + │ - device_category │ + │ - browser_family │ + │ - language_id │ + │ │ + └────────────┬────────────────────────┬─────────┘ + │ │ + │ │ + ▼ ▼ + ┌─────────────────────────┐ ┌────────────────────────────┐ + │ engagement_daily │ │ sessions_by_*_daily │ + │ (daily rollup) │ │ (device / browser / lang) │ + │ │ │ │ + │ - total_sessions │ │ - total_sessions │ + │ - engaged_sessions │ │ - engaged_sessions │ + │ - total_duration_* │ │ - total_duration_engaged │ + │ │ │ │ + └────────────┬────────────┘ └──────────────┬─────────────┘ + │ │ + ▼ ▼ + ┌────────────────────────────────────────────────────────┐ + │ CubeJS │ + │ │ + │ - KPIs (rates, averages) │ + │ - Trends (per day) │ + │ - Breakdowns (device / browser / language) │ + │ │ + └───────────────────────────┬────────────────────────────┘ + │ + ▼ + ┌───────────────────────────────────────────────┐ + │ Angular Dashboard │ + │ │ + │ - KPI cards (Today vs Yesterday) │ + │ - Distribution widgets │ + │ - Trends │ + │ │ + └───────────────────────────────────────────────┘ + + Data rollup = To take many detailed rows and aggregate them into fewer, 
higher-level rows. + + Key tenant scope: + All data is scoped by BOTH: + - customer_id (tenant/customer) + - cluster_id (environment/cluster: prod/stage/etc.) + Session identity is: + (customer_id, cluster_id, sessionid) + + Why this matters: + - data isolation between customers + - isolation between environments + - allows filtering dashboards by customer+cluster with predictable performance + + ----------------------------------------------------------------------------------------------------- + Operational Notes + ----------------------------------------------------------------------------------------------------- + + • Refreshable MVs (REFRESH EVERY ...) are used to periodically recompute recent results. This gives + near-real-time dashboards without paying heavy costs on every query. + + • "Late arriving events": + We assume some events can arrive late (ingestion delays, client retries, network issues). Therefore: + - session_states is updated incrementally in real-time + - session_facts is re-finalized for sessions active in a rolling time window (e.g., last 72h) + - daily rollups are recomputed for last N days (e.g., 90 days) + + • IMPORTANT: + The heuristics for device and browser bucketing should be validated against your UA parser output. + You can refine the classification later without changing the pipeline structure. + +===================================================================================================== */ + +/* ===================================================================================================== + 1) SESSION STATES (INCREMENTAL, MERGEABLE) + ===================================================================================================== + + Object: Table + Name: clickhouse_test_db.session_states + Engine: AggregatingMergeTree + Updated by: session_states_mv (real-time MV on events inserts) + + Purpose: + - Incrementally aggregates events into per-session "states" that can be merged efficiently. 
Initially, + you get multiple partial rows for the same session key. However, at query time (or during background + merges), ClickHouse merges them, so N partial rows become 1 correct session. + - This enables: + • real-time incremental ingestion + • late arriving event correction (because new states merge in) + • avoiding expensive GROUP BY on raw events repeatedly + • Inserts that stay fast: + • MV does simple aggregation on the batch + • no global coordination + • no locking + - Instead of recomputing sessions from raw events every 15 mins, we continuously maintain aggregate + states per session using an MV. This is extremely scalable in ClickHouse because: + • inserts are append-only + • aggregation states merge efficiently during background merges + • your “session table” becomes tiny compared to raw events + - The table ENGINE set to 'AggregatingMergeTree' means: + • Rows are not final + • Multiple partial rows for the same session will exist. ClickHouse will merge them later + • Every column must be mergeable + • This is why you never query session_states directly for dashboards + - A mental analogy: Think of session_states as a shopping cart: + • items get added in multiple steps + • sometimes items arrive late + • the cart is not “final” until checkout + -> session_facts is the receipt. + + Key Grain (one logical session): + (customer_id, cluster_id, sessionid, context_site_id) + + Column strategy: + We store AggregateFunction(...) columns rather than finalized values. + Example: + - min_ts_state stores minState(_timestamp) + - later, minMerge(min_ts_state) finalizes the min timestamp across merges + + Why using argMaxState for dimensions? + - Session metadata can be emitted multiple times. + - We want a deterministic "latest value" for device/browser/language. + - argMaxState(value, _timestamp) keeps the value associated with the greatest timestamp. 
+ +===================================================================================================== */ +CREATE TABLE clickhouse_test_db.session_states +( + /* Tenant boundaries */ + customer_id String, -- dotCMS customer / tenant identifier + cluster_id String, -- environment/cluster identifier (prod/stage/etc.) + /* We use argMax for context_site_id because session_states is mergeable, and ClickHouse needs an explicit rule + to deterministically resolve the value when partial session data is merged. */ + context_site_id_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')), + context_site_id String, + /* Session boundary */ + sessionid String, -- unique session identifier; all events with same sessionid belong together + + /* Session time window (mergeable aggregate states) */ + min_ts_state AggregateFunction(min, DateTime64(3, 'UTC')), -- earliest event timestamp seen in session + max_ts_state AggregateFunction(max, DateTime64(3, 'UTC')), -- latest event timestamp seen in session + + /* Event counters (mergeable) */ + total_events_state AggregateFunction(count), -- total events in session + pageviews_state AggregateFunction(countIf, UInt8), -- number of pageview events in session + conversions_state AggregateFunction(countIf, UInt8), -- number of conversion events in session + + user_agent_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')), + + /* Dimension "last known value" states (mergeable) */ + device_category_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')), + -- last-seen device category label for the session (Desktop/Mobile/Tablet/Other) + -- derived from UA fields; stored as state so that late events can update the final value deterministically. 
+ + browser_family_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')), + -- last-seen browser family bucket (Chrome/Safari/Firefox/Edge/Other) + + language_id_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')) + -- last-seen dotCMS language id (as String), defaulting to '0' if unknown +) + /* Why this engine is mandatory: + -> You are storing aggregate states + -> You rely on merge correctness + -> Without replication, different replicas would compute different session states */ + /*ENGINE = ReplicatedAggregatingMergeTree( + '/clickhouse/tables/{shard}/session_states', + '{replica}' + )*/ + ENGINE = AggregatingMergeTree + /* Partitioning note: + We partition by a hash of (customer, cluster) to spread writes and merges. + This avoids a single giant partition for big tenants and keeps merges parallelizable. */ + PARTITION BY sipHash64(customer_id, cluster_id) % 64 + /* Note for Sort key: + ORDER BY includes tenant + session to keep session states physically clustered for merges/finalization. + This also ensures stable grouping keys for session_facts refresh queries. */ + ORDER BY ( + customer_id, + cluster_id, + sessionid, + context_site_id); + + +/* ===================================================================================================== + Device categorization + ===================================================================================================== */ +/* + Object: Table + Name: clickhouse_test_db.device_category_map + Engine: MergeTree + Used by: session_states_mv (during ingestion) + + This is a tiny dimension table that maps normalized UA parser outputs into a device category bucket. This + table is meant to classify using normalized keys we already trust: + + - parsed_ua_device_family (dev_key) + - parsed_ua_os_family (os_key) + + Those fields come from the UA parser and are typically already tokenized into a clean family name (e.g., iPhone, + iPad, Windows, Mac OS X, etc.). 
After lowerUTF8(...), those become stable join keys. Think of this table as a + trusted translation dictionary: + + • Your UA parser gives you tokens like “iphone”, “ipad”, “windows”. + • This table translates those tokens into friendly categories like “Mobile”, “Tablet”, “Desktop”. + • It’s clean, deterministic, and fast — like looking up a word in a dictionary. + */ +CREATE TABLE IF NOT EXISTS clickhouse_test_db.device_category_map +( + -- What we match on (pick one: device_family, os_family, ua_family, etc.) + -- For performance and simplicity, we store normalized lowercase keys. + match_key String, -- e.g. 'iphone', 'android', 'ipad', 'smarttv', 'windows' + device_category String -- e.g. 'Mobile', 'Tablet', 'Smart TV', 'Desktop', 'Other' +) + ENGINE = MergeTree + ORDER BY match_key; + +/* Initial preload of the most common devices inside their respective categories */ +INSERT INTO clickhouse_test_db.device_category_map VALUES + ('smarttv', 'Smart TV'), + ('smart tv', 'Smart TV'), + ('tizen', 'Smart TV'), + ('webos', 'Smart TV'), + ('hbbtv', 'Smart TV'), + ('roku', 'Smart TV'), + ('appletv', 'Smart TV'), + ('apple tv', 'Smart TV'), + + ('ipad', 'Tablet'), + ('tablet', 'Tablet'), + ('kindle', 'Tablet'), + + ('iphone', 'Mobile'), + ('ios', 'Mobile'), + ('android', 'Mobile'), + ('pixel', 'Mobile'), + ('samsung', 'Mobile'), + + ('windows', 'Desktop'), + ('mac', 'Desktop'), + ('macos', 'Desktop'), + ('mac os x', 'Desktop'), + ('os x', 'Desktop'), + ('linux', 'Desktop'); + +/* + Object: Table + Name: clickhouse_test_db.device_category_fallback_rules + Engine: MergeTree + Used by: session_facts_rmv (during session finalization, not ingestion) + + Device fallback rules are set for when the base classification is too generic (“Desktop/Other”) and you want + a last-resort rescue by scanning the raw UA string. 
This table is a small set of priority-ordered fallback + heuristics used only when the base classification is too generic, and you still have the raw user-agent string + available. This is valuable when: + + - the UA parser doesn’t recognize the device family cleanly; i.e., it fails to identify Smart TVs/consoles cleanly + - or collapses unknown stuff into generic families + - or you see real-world UAs where the relevant signal is only in the raw string + */ +CREATE TABLE IF NOT EXISTS clickhouse_test_db.device_category_fallback_rules +( + priority UInt16, -- lower = evaluated first + category String, -- e.g. 'Smart TV', 'Tablet', 'Mobile' + patterns Array(String) -- case-insensitive substrings +) + ENGINE = MergeTree + ORDER BY priority; + +/* ⚠️ Keep this table small. Think “last-resort heuristics”, not full UA parsing. */ +INSERT INTO clickhouse_test_db.device_category_fallback_rules VALUES + (10, 'Smart TV', ['smarttv','smart tv','tizen','webos','hbbtv','roku','appletv']), + (20, 'Tablet', ['ipad','tablet','kindle']), + (30, 'Mobile', ['iphone','android','mobile']), + (90, 'Desktop', ['windows','macintosh','linux']), + (15, 'Game Console', ['playstation','xbox','nintendo','switch']); + + +/* ===================================================================================================== + Browser categorization + ===================================================================================================== */ + +/* + Object: Table + Name: clickhouse_test_db.browser_family_map + Engine: MergeTree + Used by: session_states_mv (during ingestion) + + This is a small lookup table that maps a normalized UA parser “browser family” token into a stable, dashboard-friendly + browser bucket. This table is meant to classify using normalized keys we already trust: + + - parsed_ua_ua_family (browser_key) + + Those fields come from the UA parser and are typically already tokenized into a clean family name (e.g., Safari, + Chrome, Firefox, etc.). 
After lowerUTF8(...), those become stable join keys.
+ */
+CREATE TABLE IF NOT EXISTS clickhouse_test_db.browser_family_map
+(
+    match_key      String,   -- normalized lowercase
+    browser_family String    -- 'Chrome','Safari','Firefox','Edge','Other'
+)
+    ENGINE = MergeTree
+    ORDER BY match_key;
+
+/* Initial preload of the most common browsers inside their respective categories */
+INSERT INTO clickhouse_test_db.browser_family_map VALUES
+    ('chrome', 'Chrome'),
+    ('chromium', 'Chrome'),
+    ('headlesschrome', 'Chrome'),
+    ('crios', 'Chrome'),
+    ('safari', 'Safari'),
+    ('mobile safari', 'Safari'),
+    ('firefox', 'Firefox'),
+    ('fxios', 'Firefox'),
+    ('edge', 'Edge'),
+    ('microsoft edge', 'Edge'),
+    ('edg', 'Edge');
+
+/*
+   Object:     Table
+   Name:       clickhouse_test_db.browser_family_fallback_rules
+   Engine:     MergeTree
+   Used by:    session_facts_rmv (during session finalization)
+
+   Browser fallback rules. They are used only in `session_facts_rmv`, not during ingestion. They are
+   applied once, at session finalization time. This table defines priority-ordered heuristic rules to
+   classify a browser by scanning the raw UA string, but only when the base classification is too generic.
+   Sometimes the browser family token from the UA parser is unhelpful:
+
+   - it’s missing
+   - it’s normalized into something too generic
+   - or it maps to Other even though the UA string clearly contains a recognizable signature
+
+   So this table lets you “rescue” classification only when needed, without making ingestion heavy.
+ */
+CREATE TABLE IF NOT EXISTS clickhouse_test_db.browser_family_fallback_rules
+(
+    priority UInt16,
+    category String,
+    patterns Array(String)
+)
+    ENGINE = MergeTree
+    ORDER BY priority;
+
+/* ⚠️ Keep this table small. Think “last-resort heuristics”, not full UA parsing.
*/ +INSERT INTO clickhouse_test_db.browser_family_fallback_rules VALUES + (10, 'Edge', ['edg','edge']), + (20, 'Chrome', ['headlesschrome/','chrome','chromium','crios']), + (30, 'Safari', ['safari']), + (40, 'Firefox', ['firefox']), + (90, 'Other', ['opera','ucbrowser']); + + +/* ===================================================================================================== + 1.1) Real-time MV: events → session_states + ===================================================================================================== + + Object: Materialized View (incremental, real-time) + Name: clickhouse_test_db.session_states_mv + Pattern: CREATE MATERIALIZED VIEW ... TO session_states AS SELECT ... + Source: clickhouse_test_db.events + Target: clickhouse_test_db.session_states + + Purpose: + - Runs on every insert to events table. + - It takes only the NEWLY INSERTED BATCH and turns it into mergeable aggregate-state rows that get + appended into session_states. + - Writes those states into session_states where they will be merged with prior states. + - This MV is the engine that makes the whole pipeline scalable. Without it, you’d be forced to compute + sessions by scanning events with a big GROUP BY whenever you need session metrics (slow and expensive + at scale). + + Key guarantee: + - Correctness over time: if more events arrive for a session later, the session_states row merges in. + + Device bucketing: + This is a heuristic based on the UA parsing fields you have: + - Tablet if device family suggests ipad/tablet + - Mobile if OS suggests iOS/Android or device hints phone/android + - Desktop otherwise + You can refine this as you validate UA outputs. 
+ + Browser bucketing: + - normalizes UA family into a stable set of display-friendly buckets + - unrecognized → Other + + Language id: + - uses `userlanguage` + - replace this expression if language id comes from a different field in your tracking payload + + - Automatically replicated + - No engine change required + - They write into replicated tables → safe + +===================================================================================================== */ +CREATE MATERIALIZED VIEW clickhouse_test_db.session_states_mv + TO clickhouse_test_db.session_states +AS +/* ------------------------------------------------------------------------------------------------- + Tables-only enrichment: + - device_category_map and browser_family_map are tiny dimension tables in ClickHouse + - We LEFT JOIN to them using normalized lowercase keys + - We keep the same “last known value” strategy with argMaxState(..., _timestamp) +------------------------------------------------------------------------------------------------- */ +WITH + lowerUTF8(parsed_ua_device_family) AS device_key, + lowerUTF8(parsed_ua_os_family) AS os_key, + lowerUTF8(parsed_ua_ua_family) AS browser_key, + + /* Prefer device_family mapping, else os_family mapping, else Desktop */ + coalesce(nullIf(d_dev.device_category, ''), nullIf(d_os.device_category, ''), 'Desktop') AS device_category, + + /* Prefer browser mapping, else Other */ + coalesce(nullIf(b_map.browser_family, ''), 'Other') AS browser_family, + + nullIf(userlanguage, '') AS language_id +SELECT + e.customer_id, + e.cluster_id, + e.context_site_id, + e.sessionid, + + /* Session time window states */ + minState(e._timestamp) AS min_ts_state, + maxState(e._timestamp) AS max_ts_state, + + /* Session-scoped ownership */ + argMaxState(e.context_site_id, e._timestamp) AS context_site_id_state, + + /* Event counters states */ + countState() AS total_events_state, + countIfState(e.event_type = 'pageview') AS pageviews_state, + 
countIfState(e.event_type = 'conversion') AS conversions_state,
+
+    /* Session UA stored as state (needed for fallback later) */
+    argMaxState(e.user_agent, e._timestamp) AS user_agent_state,
+
+    /* "last seen" dimension states (tables-only values) */
+    argMaxState(device_category, e._timestamp) AS device_category_state,
+    argMaxState(browser_family, e._timestamp) AS browser_family_state,
+    argMaxState(coalesce(language_id, '0'), e._timestamp) AS language_id_state
+FROM clickhouse_test_db.events AS e
+    /* Device mapping via table */
+    LEFT JOIN clickhouse_test_db.device_category_map AS d_dev
+        ON d_dev.match_key = device_key
+    LEFT JOIN clickhouse_test_db.device_category_map AS d_os
+        ON d_os.match_key = os_key
+    /* Browser mapping via table */
+    LEFT JOIN clickhouse_test_db.browser_family_map AS b_map
+        ON b_map.match_key = browser_key
+WHERE e.sessionid != ''
+  AND e.customer_id != ''
+  AND e.cluster_id != ''
+  AND e.context_site_id != ''
+GROUP BY (
+    e.customer_id,
+    e.cluster_id,
+    e.context_site_id,
+    e.sessionid);
+
+
+/* =====================================================================================================
+   2) SESSION FACTS (FINALIZED SNAPSHOT)
+   =====================================================================================================
+
+   Object:       Table
+   Name:         clickhouse_test_db.session_facts
+   Engine:       ReplacingMergeTree(updated_at)
+   Written by:   session_facts_rmv (refreshable MV that finalizes session_states)
+   Role in pipeline: “finalized session snapshot” — one row per session, query-friendly
+
+   Purpose:
+   - This is the first table in the pipeline that is meant to be read directly by downstream rollups and
+     dashboards. Stores one finalized row per session with plain scalar values (per session aggregates).
+   - This table represents “what happened in a session”, including duration, event counts, and the engaged
+     flag. This gives you a stable, query-friendly “session dimension” with everything you need for engagement
+     metrics.
+ - This is the "source of truth" for session-level analytics and is the input for daily rollups. + + Why using ReplacingMergeTree? + - Because we periodically "re-finalize" the same sessions as late events arrive. + - Each refresh writes a newer version (updated_at). + - ReplacingMergeTree keeps only the latest version during merges. + + IMPORTANT: + - When verifying correctness, do NOT rely on FINAL in production queries. Use FINAL only for + debugging/sanity checks. + - For normal queries, the latest version will be used by merges/reads over time. + +===================================================================================================== */ +CREATE TABLE clickhouse_test_db.session_facts +( + /* Tenant scope */ + customer_id String, + cluster_id String, + context_site_id String, + /* Session identity */ + sessionid String, + + /* Finalized session times */ + session_start DateTime64(3, 'UTC'), -- earliest event timestamp + session_end DateTime64(3, 'UTC'), -- latest event timestamp + duration_seconds UInt32, -- session_end - session_start (seconds) + + /* Finalized counters */ + total_events UInt32, -- total events in session + pageviews UInt32, -- pageview events + conversions UInt32, -- conversion events + + /* Engagement flag (GA4-style) */ + engaged UInt8, -- 1 if engaged, else 0 + + /* Finalized dimensions */ + device_category String, -- Desktop/Mobile/Tablet/Other + browser_family String, -- Chrome/Safari/Firefox/Edge/Other + language_id String, -- dotCMS language id as String ('0' unknown) + + /* Row version timestamp for ReplacingMergeTree */ + updated_at DateTime('UTC') +) + /* Why this matters: + -> Late events cause re-finalization + -> Multiple versions of the same session will exist temporarily + -> Replication guarantees: + -> all replicas converge to the same “latest” row + -> dashboards don’t disagree depending on which replica is queried */ + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/session_facts', + 
'{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + /* Partition by month of session_start. Keeps partitions time-bounded and supports TTL strategies + later if desired. */ + PARTITION BY toYYYYMM(toDate(session_start)) + /* Sort key includes session identity for deterministic replacement. */ + ORDER BY ( + customer_id, + cluster_id, + context_site_id, + sessionid); + + +/* ===================================================================================================== + 2.1) Refreshable MV: session_states → session_facts + ===================================================================================================== + + Object: Refreshable Materialized View (RMV) + Name: clickhouse_test_db.session_facts_rmv + Type: REFRESH EVERY 15 MINUTE … TO session_facts + Source: clickhouse_test_db.session_states + Target: clickhouse_test_db.session_facts + + Purpose: + - This RMV is the finalizer: it takes “mergeable session state fragments” from session_states and + periodically produces (or re-produces) the latest finalized snapshot of each session in session_facts. + It’s not triggered by inserts; it runs on a schedule. + - Applies the engagement rules to compute engaged = 1 (engaged) or 0 (not engaged). + - Recompute “recent sessions” every 15 minutes. + - Important design choice: do not recompute “all history” every 15 minutes. Instead, recompute a + sliding window that covers late-arriving events + sessions that are still “open”. A very typical + window is 48 hours (tune as needed). We're setting 72 hours back for now. + + Rolling window (start_cutoff): + - We only finalize sessions whose session_end is in the last 72 hours, which can be increased as long as + the REFRESH EVERY value is increased as well. + - This bounds cost and still corrects for late arriving events. + - For older late events, a nightly "re-finalize last 30 days" job can be added in the future. 
+ + Engagement rules (session is engaged if any): + - duration > 10 seconds + - pageviews >= 2 + - conversions >= 1 + +===================================================================================================== */ +CREATE MATERIALIZED VIEW clickhouse_test_db.session_facts_rmv + REFRESH EVERY 15 MINUTE + TO clickhouse_test_db.session_facts +AS +WITH + (now() - INTERVAL 72 HOUR) AS start_cutoff, + + /* Load fallback rules for device and browser once (small arrays, evaluated per session row) */ + ( + SELECT arraySort(x -> x.1, groupArray((priority, category, patterns))) + FROM clickhouse_test_db.device_category_fallback_rules + ) AS device_rules_sorted, + + ( + SELECT arraySort(x -> x.1, groupArray((priority, category, patterns))) + FROM clickhouse_test_db.browser_family_fallback_rules + ) AS browser_rules_sorted +SELECT + customer_id, + cluster_id, + context_site_id, + sessionid, + + session_start, + session_end, + duration_seconds, + + total_events, + pageviews, + conversions, + + engaged, + + /* FINAL device_category (map-first, UA-pattern fallback second) + * + * - device_category_base is derived from session_states_mv using table joins: + * device_category_map (device_family/os_family) → category, else 'Desktop' + * - We only apply fallback when base is generic ('Desktop' or 'Other') AND we have a UA string. + * - Fallback scans ordered rules (priority asc) and returns the first match. + * - If no fallback matches, keep base. 
+ */
+    if ((device_category_base IN ('Desktop', 'Other')) AND ua_l != '',
+        coalesce(
+            nullIf(
+                ifNull(
+                    arrayFirst(r -> multiSearchAnyCaseInsensitive(ua_l, r.3) > 0, device_rules_sorted).2,
+                    ''
+                ),
+                ''
+            ), device_category_base),
+        -- else
+        device_category_base
+    ) AS device_category,
+
+    /* FINAL browser_family (map-first, UA-pattern fallback second)
+     *
+     * - browser_family_base is derived from session_states_mv using table joins:
+     *   browser_family_map (ua_family) → family, else 'Other'
+     * - We only apply fallback when base resolves to 'Other' AND UA string is available.
+     * - Fallback is priority ordered and returns the first match.
+     * - If no fallback matches, keep base.
+     */
+    if ((browser_family_base = 'Other') AND ua_l != '',
+        coalesce(
+            nullIf(
+                ifNull(
+                    arrayFirst(r -> multiSearchAnyCaseInsensitive(ua_l, r.3) > 0, browser_rules_sorted).2,
+                    ''
+                ),
+                ''
+            ), browser_family_base),
+        -- else
+        browser_family_base
+    ) AS browser_family,
+
+    language_id,
+
+    now() AS updated_at
+FROM
+    (
+        /* Aggregate session_states into finalized scalar columns */
+        SELECT
+            customer_id,
+            cluster_id,
+            context_site_id,
+            sessionid,
+
+            minMerge(min_ts_state) AS session_start,
+            maxMerge(max_ts_state) AS session_end,
+
+            toUInt32(greatest(0, dateDiff('second', session_start, session_end))) AS duration_seconds,
+
+            toUInt32(countMerge(total_events_state))   AS total_events,
+            toUInt32(countIfMerge(pageviews_state))    AS pageviews,
+            toUInt32(countIfMerge(conversions_state))  AS conversions,
+
+            /* Business rules that determine whether a session is flagged as 'engaged' or not */
+            toUInt8(
+                -- Sessions that last more than 10 seconds
+                (dateDiff('second', session_start, session_end) > 10)
+                -- Sessions that trigger at least 2 events of type 'pageview'
+                OR (countIfMerge(pageviews_state) >= 2)
+                -- Sessions that trigger at least 1 event of type 'conversion'
+                OR (countIfMerge(conversions_state) >= 1)
+            ) AS engaged,
+
+            /* Base values now come from table-join mapping
in session_states_mv */ + coalesce(nullIf(argMaxMerge(device_category_state), ''), 'Desktop') AS device_category_base, + coalesce(nullIf(argMaxMerge(browser_family_state), ''), 'Other') AS browser_family_base, + + /* UA for fallback matching */ + lowerUTF8(argMaxMerge(user_agent_state)) AS ua_l, + + argMaxMerge(language_id_state) AS language_id + FROM clickhouse_test_db.session_states + GROUP BY ( + customer_id, + cluster_id, + context_site_id, + sessionid) + HAVING session_end >= start_cutoff + ) s; + + +/* ===================================================================================================== + 3) DAILY ENGAGEMENT ROLLUP (DASHBOARD READY) + ===================================================================================================== + + Object: Table + Name: clickhouse_test_db.engagement_daily + Engine: ReplacingMergeTree(updated_at) + Written by: engagement_daily_rmv (refreshable MV) + Read by: CubeJS + dashboard KPI cards / trend charts + + Purpose: + - Stores daily aggregates derived from session_facts. It's intentionally small and dashboard-friendly. + - This table is meant to serve the dashboard fast (engagement rate, trend, average interactions, + average session time, conversion rate) without scanning sessions. + - This is the primary table queried by KPI cards and trend charts. + - It avoids scanning session_facts for common dashboards. + + Why store daily sums? + - For any time range, correct results are computed as: + sum(numerator) / sum(denominator) + NOT average of daily rates. + - Having daily sums allows accurate ratios for arbitrary date ranges. Dashboards frequently query arbitrary + date ranges (e.g., “last 28 days”, “Jan 10–Feb 2”). If you store daily rates (like daily engagement%), + then combining days correctly becomes tricky (you must weight by denominators). Instead, you store daily + numerators and denominators, so any date-range query can compute accurate rates. 
+ + Grain: + (customer_id, cluster_id, context_site_id, day) + + Metrics included: + - total_sessions + - engaged_sessions + - engaged_conversion_sessions + - total_events_all + - total_duration_all + - total_events_engaged + - total_duration_engaged + +===================================================================================================== */ +CREATE TABLE clickhouse_test_db.engagement_daily +( + customer_id String, + cluster_id String, + context_site_id String, + day Date, + + total_sessions UInt64, -- count of all sessions + engaged_sessions UInt64, -- count of engaged sessions + engaged_conversion_sessions UInt64, -- engaged sessions that include >=1 conversion + + total_events_all UInt64, -- sum(total_events) across all sessions + total_duration_all UInt64, -- sum(duration_seconds) across all sessions + + total_events_engaged UInt64, -- sum(total_events) across engaged sessions only + total_duration_engaged UInt64, -- sum(duration_seconds) across engaged sessions only + + updated_at DateTime('UTC') +) + /* Why replicate rollups? 
+ -> Refreshable MVs rewrite rows + -> Each replica must agree on the final row version + -> Otherwise, engagement rates can differ between replicas */ + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/engagement_daily', + '{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, cluster_id, context_site_id, day); + + +/* ===================================================================================================== + 3.1) Refreshable MV: session_facts → engagement_daily + ===================================================================================================== + + Object: Refreshable Materialized View (RMV) + Name: clickhouse_test_db.engagement_daily_rmv + Type: REFRESH EVERY 15 MINUTE … TO engagement_daily + Source: clickhouse_test_db.session_facts + Target: clickhouse_test_db.engagement_daily + + Purpose: + - Periodically recompute daily rollups for recent days from the finalized session snapshot table + (session_facts) and writes the results into the dashboard-ready rollup table (engagement_daily). + - Here we typically roll up a wider window (e.g., last 90 days) because session_facts is already + small and cheap to scan, we can afford to recompute aggregates for many days back — like 90 days — + every time the MV refreshes (much smaller than raw events). + - Cheap because session_facts is far smaller than events. + + Start window (start_day): + - Recompute last 90 days by default. + - Tune based on retention needs + cost tolerance. + - This makes daily rollups correct even if session_facts rows are updated by late events. 
+ +===================================================================================================== */ +CREATE MATERIALIZED VIEW clickhouse_test_db.engagement_daily_rmv + REFRESH EVERY 15 MINUTE + TO clickhouse_test_db.engagement_daily +AS +WITH (today() - 90) AS start_day +SELECT + customer_id, + cluster_id, + context_site_id, + toDate(session_start) AS day, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + countIf(engaged = 1 AND conversions >= 1) AS engaged_conversion_sessions, + + sum(total_events) AS total_events_all, + sum(duration_seconds) AS total_duration_all, + + sumIf(total_events, engaged = 1) AS total_events_engaged, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged, + + now() AS updated_at +FROM clickhouse_test_db.session_facts +WHERE toDate(session_start) >= start_day +GROUP BY ( + customer_id, + cluster_id, + context_site_id, + day); + + +/* ===================================================================================================== + 4) DIMENSION ROLLUPS (DEVICE / BROWSER / LANGUAGE) + ===================================================================================================== + + These rollups power the "distribution" widgets: + - Sessions by device + - Sessions by browser + - Sessions by language + + For each bucket (e.g., Desktop): + - show engaged_sessions (absolute count) + - show engaged% within bucket = engaged_sessions / total_sessions + - show avg engaged time = total_duration_engaged / engaged_sessions + + Therefore, each rollup stores BOTH: + - total_sessions (all sessions in bucket) + - engaged_sessions (subset) + - total_duration_engaged_seconds (duration sum for engaged subset) + + Grain: + (customer_id, cluster_id, context_site_id, day, {bucket_value}) + {bucket_value} = device, browser, or language ID + +===================================================================================================== */ + +/* 
----------------------------------------------------------------------------------------------------- + 4.1) Sessions by Device (daily) +----------------------------------------------------------------------------------------------------- */ + +/* + Object: Table + Name: clickhouse_test_db.sessions_by_device_daily + Engine: ReplacingMergeTree(updated_at) + Written by: sessions_by_device_daily_rmv + Role: daily rollup for the “Sessions by Device” distribution widget + + Mental model: + Think of this table as a daily per-site scoreboard by device category. + Each row stores raw counts (total sessions, engaged sessions, engaged time) + for one device bucket on one day. The table is periodically rewritten from + session_facts to correct late-arriving data. Dashboards compute percentages + and averages from these raw totals instead of storing precomputed rates. +*/ +CREATE TABLE clickhouse_test_db.sessions_by_device_daily +( + customer_id String, + cluster_id String, + context_site_id String, + day Date, + + device_category String, -- Desktop/Mobile/Tablet/Other + + total_sessions UInt64, -- ALL sessions for this device_category + engaged_sessions UInt64, -- Engaged sessions for this device_category + total_duration_engaged_seconds UInt64, -- Sum(duration_seconds) for engaged sessions only + + updated_at DateTime('UTC') +) + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/sessions_by_device_daily', + '{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, cluster_id, context_site_id, day, device_category); + +/* + Object: RMV + Name: clickhouse_test_db.sessions_by_device_daily_rmv + Type: REFRESH EVERY 15 MINUTE … TO sessions_by_device_daily + Source: session_facts + Target: sessions_by_device_daily + + Recomputes a bounded window (this script uses last 90 days via start_day) + */ +CREATE MATERIALIZED VIEW clickhouse_test_db.sessions_by_device_daily_rmv + REFRESH EVERY 15 MINUTE + TO 
clickhouse_test_db.sessions_by_device_daily +AS +WITH (today() - 90) AS start_day +SELECT + customer_id, + cluster_id, + context_site_id, + toDate(session_start) AS day, + device_category, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, + + now() AS updated_at +FROM clickhouse_test_db.session_facts +WHERE toDate(session_start) >= start_day +GROUP BY ( + customer_id, + cluster_id, + context_site_id, + day, + device_category); + + +/* ----------------------------------------------------------------------------------------------------- + 4.2) Sessions by Browser (daily) +----------------------------------------------------------------------------------------------------- */ + +/* + Object: Table + Name: clickhouse_test_db.sessions_by_browser_daily + Engine: ReplacingMergeTree(updated_at) + Written by: sessions_by_browser_daily_rmv + Role: daily rollup for the “Sessions by Browser” distribution widget + + Mental model: + Think of this table as a daily per-site scoreboard by browser. + Each row stores raw counts (total sessions, engaged sessions, engaged time) + for one browser bucket on one day. The table is periodically rewritten from + session_facts to correct late-arriving data. Dashboards compute percentages + and averages from these raw totals instead of storing precomputed rates. 
+*/ +CREATE TABLE clickhouse_test_db.sessions_by_browser_daily +( + customer_id String, + cluster_id String, + context_site_id String, + day Date, + + browser_family String, -- Chrome/Safari/Firefox/Edge/Other + + total_sessions UInt64, + engaged_sessions UInt64, + total_duration_engaged_seconds UInt64, + + updated_at DateTime('UTC') +) + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/sessions_by_browser_daily', + '{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, cluster_id, context_site_id, day, browser_family); + +/* + Object: RMV + Name: clickhouse_test_db.sessions_by_browser_daily_rmv + Type: REFRESH EVERY 15 MINUTE … TO sessions_by_browser_daily + + Same pattern as device: bounded window (e.g., last 90 days) + */ +CREATE MATERIALIZED VIEW clickhouse_test_db.sessions_by_browser_daily_rmv + REFRESH EVERY 15 MINUTE + TO clickhouse_test_db.sessions_by_browser_daily +AS +WITH (today() - 90) AS start_day +SELECT + customer_id, + cluster_id, + context_site_id, + toDate(session_start) AS day, + browser_family, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, + + now() AS updated_at +FROM clickhouse_test_db.session_facts +WHERE toDate(session_start) >= start_day +GROUP BY ( + customer_id, + cluster_id, + context_site_id, + day, + browser_family); + + +/* ----------------------------------------------------------------------------------------------------- + 4.3) Sessions by Language (daily) +----------------------------------------------------------------------------------------------------- */ + +/* + Object: Table + Name: clickhouse_test_db.sessions_by_language_daily + Engine: ReplacingMergeTree(updated_at) + Written by: sessions_by_language_daily_rmv + Role: daily rollup for “Sessions by Language” distribution widget + + Mental model: + Think of this table as a daily per-site 
scoreboard by language. + Each row stores raw counts (total sessions, engaged sessions, engaged time) + for one language bucket on one day. The table is periodically rewritten from + session_facts to correct late-arriving data. Dashboards compute percentages + and averages from these raw totals instead of storing precomputed rates. +*/ +CREATE TABLE clickhouse_test_db.sessions_by_language_daily +( + customer_id String, + cluster_id String, + context_site_id String, + day Date, + + language_id String, -- dotCMS language id as String ('0' unknown) + + total_sessions UInt64, + engaged_sessions UInt64, + total_duration_engaged_seconds UInt64, + + updated_at DateTime('UTC') +) + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/sessions_by_language_daily', + '{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, cluster_id, context_site_id, day, language_id); + +/* + Object: RMV + Name: clickhouse_test_db.sessions_by_language_daily_rmv + Type: REFRESH EVERY 15 MINUTE … TO sessions_by_language_daily + + Same recompute model: scan session_facts for last N days (90) + */ +CREATE MATERIALIZED VIEW clickhouse_test_db.sessions_by_language_daily_rmv + REFRESH EVERY 15 MINUTE + TO clickhouse_test_db.sessions_by_language_daily +AS +WITH (today() - 90) AS start_day +SELECT + customer_id, + cluster_id, + context_site_id, + toDate(session_start) AS day, + language_id, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, + + now() AS updated_at +FROM clickhouse_test_db.session_facts +WHERE toDate(session_start) >= start_day +GROUP BY ( + customer_id, + cluster_id, + context_site_id, + day, + language_id);