From 3d8cddd501ce4ffe0c404de698335b8262e88c27 Mon Sep 17 00:00:00 2001 From: Jose Castro Date: Wed, 4 Feb 2026 22:59:43 -0600 Subject: [PATCH] task(content analytics) #34395 : Implement the Engagement SQL structures --- .../analytics/setup/config/dev/cube/cube.js | 12 +- .../config/dev/cube/schema/EngagementDaily.js | 191 +++ .../config/dev/cube/schema/EventSummary.js | 8 +- .../dev/cube/schema/SessionsByBrowserDaily.js | 110 ++ .../dev/cube/schema/SessionsByDeviceDaily.js | 114 ++ .../cube/schema/SessionsByLanguageDaily.js | 111 ++ .../setup/db/clickhouse/init-scripts/init.sql | 1144 ++++++++++++++++- 7 files changed, 1679 insertions(+), 11 deletions(-) create mode 100644 docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js create mode 100644 docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js create mode 100644 docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js create mode 100644 docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js index d68b5a3b52b8..1c02b39da2a1 100644 --- a/docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js @@ -37,7 +37,17 @@ const fs = require('fs'); const path = require('path'); // Whitelist of allowed cubes for security validation -const ALLOWED_CUBES = ['Events', 'request', 'ContentAttribution', 'Conversion', 'EventSummary']; +const ALLOWED_CUBES = [ + 'Events', + 'request', + 'ContentAttribution', + 'Conversion', + 'EventSummary', + 'EngagementDaily', + 'SessionsByDeviceDaily', + 'SessionsByBrowserDaily', + 'SessionsByLanguageDaily' +]; // Extract customer ID from scope (required for all queries) // Note: Scope comes from 
validated JWT token, so customer_id is already trusted diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js new file mode 100644 index 000000000000..46f9779fa048 --- /dev/null +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js @@ -0,0 +1,191 @@ +/** + * ===================================================================================== + * 1) EngagementDaily Cube + * ===================================================================================== + * + * Source table: + * clickhouse_test_db.engagement_daily + * + * Grain: + * One row per (customer_id, cluster_id, context_site_id, day). + * + * Primary use cases: + * - KPI cards: + * - Engagement Rate + * - Avg Interactions (Engaged) + * - Avg Session Time + * - Total Sessions + * - Conversion Rate (Engaged conversions / Total sessions) + * - Trend charts: + * - Engaged sessions per day + * - Total sessions per day + * + * Why this cube is fast: + * - The table is already rolled up daily in ClickHouse. + * - Queries over date ranges become sums over days, not scans of sessions/events. + * + * Correctness notes: + * - Rates must be computed as ratio-of-sums over the selected period. + * Example: + * engagementRate = sum(engaged_sessions) / sum(total_sessions) + * Do NOT average daily engagement rates. 
+ */ +cube(`EngagementDaily`, { + sql: ` + SELECT + customer_id, cluster_id, + context_site_id, + day, + total_sessions, + engaged_sessions, + engaged_conversion_sessions, + total_events_all, + total_duration_all, + total_events_engaged, + total_duration_engaged, + updated_at + FROM clickhouse_test_db.engagement_daily + `, + + measures: { + // --------------------------- + // Base counts (raw sums) + // + // Base totals (these are additive; safe to SUM over any date range) + // --------------------------- + totalSessions: { + sql: `total_sessions`, + type: `sum`, + title: `Total Sessions`, + description: `Total sessions in the selected period (sum of daily totals).` + }, + + engagedSessions: { + sql: `engaged_sessions`, + type: `sum`, + title: `Engaged Sessions`, + description: `Number of engaged sessions in the selected period.` + }, + + engagedConversionSessions: { + sql: `engaged_conversion_sessions`, + type: `sum`, + title: `Engaged Sessions With Conversion`, + description: `Engaged sessions that contain at least one conversion event.` + }, + + // --------------------------- + // Base sums for averages + // --------------------------- + totalEventsAll: { + sql: `total_events_all`, + type: `sum`, + title: `Total Events (All Sessions)`, + description: `Sum of total events across ALL sessions (engaged + non-engaged).` + }, + + totalDurationAllSeconds: { + sql: `total_duration_all`, + type: `sum`, + title: `Total Session Duration (Seconds, All Sessions)`, + description: `Sum of duration_seconds across ALL sessions.` + }, + + totalEventsEngaged: { + sql: `total_events_engaged`, + type: `sum`, + title: `Total Events (Engaged Sessions)`, + description: `Sum of total events across ENGAGED sessions only.` + }, + + totalDurationEngagedSeconds: { + sql: `total_duration_engaged`, + type: `sum`, + title: `Total Session Duration (Seconds, Engaged Sessions)`, + description: `Sum of duration_seconds across ENGAGED sessions only.` + }, + + // --------------------------- + // KPI Ratios
(ratio of sums) + // --------------------------- + // === Derived KPI measures (computed as ratio of sums; correct for any time range) === + engagementRate: { + // engaged sessions / total sessions + sql: `sum(engaged_sessions) / nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Engagement Rate`, + description: `Engaged Sessions / Total Sessions (ratio of sums over the selected period).`, + }, + + conversionRate: { + // engaged sessions with >=1 conversion / total sessions (your definition) + sql: `sum(engaged_conversion_sessions) / nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Conversion Rate`, + description: `Engaged sessions with >=1 conversion / Total sessions.` + }, + + avgInteractionsPerEngagedSession: { + // average events per engaged session + sql: `sum(total_events_engaged) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Interactions per Engaged Session`, + description: `Total events in engaged sessions / engaged sessions.` + }, + + avgSessionTimeSeconds: { + // average session duration (seconds) over all sessions + sql: `sum(total_duration_all) / nullIf(sum(total_sessions), 0)`, + type: `number`, + title: `Avg Session Time (Seconds)`, + description: `Average time per session over ALL sessions: total_duration_all / total_sessions.` + }, + + avgEngagedSessionTimeSeconds: { + // average duration among engaged sessions only (often useful) + sql: `sum(total_duration_engaged) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Engaged Session Time (Seconds)`, + description: `Average time per ENGAGED session: total_duration_engaged / engaged_sessions.` + }, + }, + + dimensions: { + customerId: { + sql: `customer_id`, + type: `string`, + title: `Customer Id`, + description: `Tenant identifier.
Always filter by this in production.` + }, + + clusterId: { + sql: `cluster_id`, + type: `string`, + title: `Cluster Id`, + description: `Environment/cluster identifier (prod/stage/etc.). Filter when needed.`, + }, + + contextSiteId: { + sql: `context_site_id`, + type: `string`, + title: `Site Id`, + description: `dotCMS Site identifier (context_site_id)` + }, + + day: { + sql: `day`, + type: `time`, + title: `Day`, + description: `Day grain used for filtering and trends. Use granularity: day.` + }, + + updatedAt: { + sql: `updated_at`, + type: `time`, + title: `Updated At`, + description: `When this rollup row was last recomputed by the refreshable MV.` + }, + }, +}); diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js index 9354d67f1ac5..83654b59b402 100644 --- a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js @@ -6,10 +6,10 @@ cube(`EventSummary`, { sql: ` SELECT day, cluster_id, customer_id, context_site_id, event_type, context_user_id, identifier, title, daily_total FROM content_events_counter - WHERE ${FILTER_PARAMS.ContentAttribution.customerId.filter('customer_id')} AND ( - ${FILTER_PARAMS.ContentAttribution.clusterId ? FILTER_PARAMS.ContentAttribution.clusterId.filter('cluster_id') : '1=1'} - OR (${FILTER_PARAMS.ContentAttribution.clusterId ? 'FALSE' : '(cluster_id IS NULL OR cluster_id = \'\')'})) AND - ${FILTER_PARAMS.ContentAttribution.day.filter('day')} + WHERE ${FILTER_PARAMS.EventSummary.customerId.filter('customer_id')} AND ( + ${FILTER_PARAMS.EventSummary.clusterId ? FILTER_PARAMS.EventSummary.clusterId.filter('cluster_id') : '1=1'} + OR (${FILTER_PARAMS.EventSummary.clusterId ? 
'FALSE' : '(cluster_id IS NULL OR cluster_id = \'\')'})) AND + ${FILTER_PARAMS.EventSummary.day.filter('day')} `, // 2) Measures diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js new file mode 100644 index 000000000000..e99e65b37f2c --- /dev/null +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js @@ -0,0 +1,110 @@ +/** + * ===================================================================================== + * 3) SessionsByBrowserDaily Cube + * ===================================================================================== + * + * Source table: + * clickhouse_test_db.sessions_by_browser_daily + * + * Grain: + * One row per (customer_id, cluster_id, context_site_id, day, browser_family). + * + * Widget semantics: + * - Engaged sessions count per browser + * - Engaged% within browser = engaged_sessions / total_sessions + * - Avg engaged time per browser = total_duration_engaged_seconds / engaged_sessions + */ +cube(`SessionsByBrowserDaily`, { + sql: ` + SELECT + customer_id, cluster_id, + context_site_id, + day, + browser_family, + total_sessions, + engaged_sessions, + total_duration_engaged_seconds, + updated_at + FROM clickhouse_test_db.sessions_by_browser_daily + `, + + measures: { + totalSessions: { + sql: `total_sessions`, + type: `sum`, + title: `Total Sessions (All)`, + description: `All sessions spawned by this browser family in the selected period.`, + }, + + engagedSessions: { + sql: `engaged_sessions`, + type: `sum`, + title: `Engaged Sessions`, + description: `Engaged sessions spawned by this browser family in the selected period.`, + }, + + totalDurationEngagedSeconds: { + sql: `total_duration_engaged_seconds`, + type: `sum`, + title: `Total Duration (Engaged, Seconds)`, + }, + + engagedRateWithinBrowser: { + sql: `sum(engaged_sessions) /
nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Engaged % (Within Browser)`, + description: `Engaged sessions / total sessions within the browser bucket.`, + }, + + avgEngagedSessionTimeSeconds: { + sql: `sum(total_duration_engaged_seconds) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Engaged Session Time (Seconds)` + }, + }, + + dimensions: { + customerId: { + sql: `customer_id`, + type: `string`, + title: `Customer Id`, + description: `Tenant identifier. Always filter by this in production.` + }, + + clusterId: { + sql: `cluster_id`, + type: `string`, + title: `Cluster Id`, + description: `Environment/cluster identifier (prod/stage/etc.). Filter when needed.`, + }, + + contextSiteId: { + sql: `context_site_id`, + type: `string`, + title: `Site Id`, + description: `dotCMS Site identifier (context_site_id)` + }, + + day: { + sql: `day`, + type: `time`, + title: `Day`, + description: `Day grain used for filtering and trends. Use granularity: day.` + }, + + browserFamily: { + sql: `browser_family`, + type: `string`, + title: `Browser Family`, + description: `Normalized bucket: Chrome / Safari / Firefox / Edge / Other.`, + }, + + updatedAt: { + sql: `updated_at`, + type: `time`, + title: `Updated At`, + description: `When this rollup row was last recomputed by the refreshable MV.` + }, + }, +}); diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js new file mode 100644 index 000000000000..5162ff2c9cab --- /dev/null +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js @@ -0,0 +1,114 @@ +/** + * ===================================================================================== + * 2) SessionsByDeviceDaily Cube + * ===================================================================================== + * + * Source 
table: + * clickhouse_test_db.sessions_by_device_daily + * + * Grain: + * One row per (customer_id, cluster_id, context_site_id, day, device_category). + * + * Widget semantics (matches your screenshots): + * - The big number (e.g., 32030) is ENGAGED sessions in that bucket. + * - The percentage (e.g., 72%) is: + * engaged_sessions_in_bucket / total_sessions_in_bucket + * - Average time is computed ONLY over engaged sessions: + * total_duration_engaged_seconds / engaged_sessions + */ +cube(`SessionsByDeviceDaily`, { + sql: ` + SELECT + customer_id, cluster_id, + context_site_id, + day, + device_category, + total_sessions, + engaged_sessions, + total_duration_engaged_seconds, + updated_at + FROM clickhouse_test_db.sessions_by_device_daily + `, + + measures: { + // Base measures (sums) + totalSessions: { + sql: `total_sessions`, + type: `sum`, + title: `Total Sessions (All)`, + description: `All sessions spawned by this device category in the selected period.` + }, + engagedSessions: { + sql: `engaged_sessions`, + type: `sum`, + title: `Engaged Sessions`, + description: `Engaged sessions spawned by this device category in the selected period.` + }, + + engagedRateWithinDevice: { + // engaged / total within the device bucket (this is your 72%) + sql: `sum(engaged_sessions) / nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Engaged % (Within Device)`, + description: `Engaged sessions / total sessions within the device category.` + }, + + avgEngagedSessionTimeSeconds: { + // average duration over engaged sessions only + sql: `sum(total_duration_engaged_seconds) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Engaged Session Time (Seconds)`, + description: `Average engaged session time within the device category.` + }, + + totalDurationEngagedSeconds: { + sql: `total_duration_engaged_seconds`, + type: `sum`, + title: `Total Duration Engaged (Seconds)`, + description: `Sum of engaged session durations for this device category.`
}, + }, + + dimensions: { + customerId: { + sql: `customer_id`, + type: `string`, + title: `Customer Id`, + description: `Tenant identifier. Always filter by this in production.` + }, + + clusterId: { + sql: `cluster_id`, + type: `string`, + title: `Cluster Id`, + description: `Environment/cluster identifier (prod/stage/etc.). Filter when needed.`, + }, + + contextSiteId: { + sql: `context_site_id`, + type: `string`, + title: `Site Id`, + description: `dotCMS Site identifier (context_site_id)` + }, + + day: { + sql: `day`, + type: `time`, + title: `Day`, + description: `Day grain used for filtering and trends. Use granularity: day.` + }, + + deviceCategory: { + sql: `device_category`, + type: `string`, + description: `Normalized bucket: Desktop / Mobile / Tablet / Other.` + }, + updatedAt: { + sql: `updated_at`, + type: `time`, + title: `Updated At`, + description: `When this rollup row was last recomputed by the refreshable MV.` + }, + }, +}); diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js new file mode 100644 index 000000000000..9924660d872f --- /dev/null +++ b/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js @@ -0,0 +1,111 @@ +/** + * ===================================================================================== + * 4) SessionsByLanguageDaily Cube + * ===================================================================================== + * + * Source table: + * clickhouse_test_db.sessions_by_language_daily + * + * Grain: + * One row per (customer_id, cluster_id, context_site_id, day, language_id). 
+ * + * Widget semantics: + * - Engaged sessions count per language + * - Engaged% within language = engaged_sessions / total_sessions + * - Avg engaged time per language = total_duration_engaged_seconds / engaged_sessions + * + * Note: + * - language_id is stored as String. + * - UI resolves the language name via REST Endpoint/API + */ + cube(`SessionsByLanguageDaily`, { + sql: ` + SELECT + customer_id, cluster_id, + context_site_id, + day, + language_id, + total_sessions, + engaged_sessions, + total_duration_engaged_seconds, + updated_at + FROM clickhouse_test_db.sessions_by_language_daily + `, + + measures: { + totalSessions: { + sql: `total_sessions`, + type: `sum`, + title: `Total Sessions (All)`, + }, + + engagedSessions: { + sql: `engaged_sessions`, + type: `sum`, + title: `Engaged Sessions`, + }, + + totalDurationEngagedSeconds: { + sql: `total_duration_engaged_seconds`, + type: `sum`, + title: `Total Duration (Engaged, Seconds)`, + }, + + engagedRateWithinLanguage: { + sql: `sum(engaged_sessions) / nullIf(sum(total_sessions), 0)`, + type: `number`, + format: `percent`, + title: `Engaged % (Within Language)`, + }, + + avgEngagedSessionTimeSeconds: { + sql: `sum(total_duration_engaged_seconds) / nullIf(sum(engaged_sessions), 0)`, + type: `number`, + title: `Avg Engaged Session Time (Seconds)`, + } + }, + + dimensions: { + customerId: { + sql: `customer_id`, + type: `string`, + title: `Customer Id`, + description: `Tenant identifier. Always filter by this in production.` + }, + + clusterId: { + sql: `cluster_id`, + type: `string`, + title: `Cluster Id`, + description: `Environment/cluster identifier (prod/stage/etc.). Filter when needed.`, + }, + + contextSiteId: { + sql: `context_site_id`, + type: `string`, + title: `Site Id`, + description: `dotCMS Site identifier (context_site_id)` + }, + + day: { + sql: `day`, + type: `time`, + title: `Day`, + description: `Day grain used for filtering and trends.
Use granularity: day.` + }, + + languageId: { + sql: `language_id`, + type: `string`, + title: `Language Id`, + description: `dotCMS language id (String). Display name resolved externally.`, + }, + + updatedAt: { + sql: `updated_at`, + type: `time`, + title: `Updated At`, + description: `When this rollup row was last recomputed by the refreshable MV.` + }, + }, +}); diff --git a/docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql b/docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql index 43b5e82be238..c2fcdea8bd4a 100644 --- a/docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql +++ b/docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql @@ -34,7 +34,7 @@ -- This is the raw event ingestion table. All data from Jitsu/collectors enters here first. -- ===================================================================== CREATE DATABASE IF NOT EXISTS clickhouse_test_db; -use clickhouse_test_db; +USE clickhouse_test_db; CREATE TABLE IF NOT EXISTS clickhouse_test_db.events ( -- ###################################################### @@ -237,7 +237,7 @@ CREATE TABLE clickhouse_test_db.content_events_counter daily_total UInt64 ) ENGINE = SummingMergeTree(daily_total) - ORDER BY (customer_id, cluster_id, context_user_id, day, identifier, title, event_type); + ORDER BY (customer_id, cluster_id, context_user_id, day, identifier, title, event_type); -- ===================================================================== @@ -303,8 +303,8 @@ CREATE TABLE clickhouse_test_db.conversion_time timestamp_last_time AggregateFunction( max, DateTime64(3, 'UTC')) ) ENGINE = AggregatingMergeTree - PARTITION BY (customer_id) - ORDER BY (customer_id, cluster_id, context_user_id); + PARTITION BY (customer_id) + ORDER BY (customer_id, cluster_id, context_user_id); -- ===================================================================== -- Tracks which content a user 
interacted with prior to a conversion and after the user's previous conversion @@ -332,8 +332,8 @@ CREATE TABLE clickhouse_test_db.content_presents_in_conversion events_count UInt32 ) ENGINE = SummingMergeTree - PARTITION BY (customer_id) - ORDER BY (customer_id, cluster_id, context_user_id, event_type, conversion_name, identifier, title, day); + PARTITION BY (customer_id) + ORDER BY (customer_id, cluster_id, context_user_id, event_type, conversion_name, identifier, title, day); -- ===================================================================== @@ -443,3 +443,1135 @@ SELECT customer_id, FROM clickhouse_test_db.content_presents_in_conversion GROUP BY customer_id, cluster_id, context_user_id, context_site_id; + +/* ===================================================================================================== + Session Engagement Pipeline + ===================================================================================================== + + This script defines the complete analytics pipeline used to compute GA4-style engagement metrics + while remaining dotCMS-specific, scalable, and explainable. The pipeline is layered: + + events (raw immutable events) + ↓ (real-time materialized view) + session_states (incremental, mergeable session aggregates) + ↓ (refreshable MV - finalize only recent sessions) + session_facts (one row per session, dashboard-friendly) + ↓ (refreshable MVs - daily rollups) + engagement_daily + sessions_by_*_daily (small rollups for dashboards) + ↓ + CubeJS reads from rollup tables and exposes metrics via API + + ┌───────────────────────────────────────────────┐ + │ Browser / Site │ + │ │ + │ pageview | content_click | conversion | ... 
│ + │ │ + └──────────────────────┬────────────────────────┘ + │ + ▼ + ┌───────────────────────────────────────────────┐ + │ events (MergeTree) │ + │ │ + │ - one row per event │ + │ - sessionid │ + │ - user agent (device, browser) │ + │ - language id │ + │ - timestamp │ + │ │ + └──────────────────────┬────────────────────────┘ + │ (incremental MV) + ▼ + ┌───────────────────────────────────────────────┐ + │ session_states (AggregatingMT) │ + │ │ + │ - one row per session (event states) │ + │ - min/max timestamp │ + │ - pageview count │ + │ - conversion count │ + │ - last-seen device / browser / language │ + │ │ + └──────────────────────┬────────────────────────┘ + │ (refreshable MV) + ▼ + ┌───────────────────────────────────────────────┐ + │ session_facts (ReplacingMT) │ + │ │ + │ - one row per session │ + │ - duration_seconds │ + │ - total events │ + │ - engaged (true/false) │ + │ - device_category │ + │ - browser_family │ + │ - language_id │ + │ │ + └────────────┬────────────────────────┬─────────┘ + │ │ + │ │ + ▼ ▼ + ┌─────────────────────────┐ ┌────────────────────────────┐ + │ engagement_daily │ │ sessions_by_*_daily │ + │ (daily rollup) │ │ (device / browser / lang) │ + │ │ │ │ + │ - total_sessions │ │ - total_sessions │ + │ - engaged_sessions │ │ - engaged_sessions │ + │ - total_duration_* │ │ - total_duration_engaged │ + │ │ │ │ + └────────────┬────────────┘ └──────────────┬─────────────┘ + │ │ + ▼ ▼ + ┌────────────────────────────────────────────────────────┐ + │ CubeJS │ + │ │ + │ - KPIs (rates, averages) │ + │ - Trends (per day) │ + │ - Breakdowns (device / browser / language) │ + │ │ + └───────────────────────────┬────────────────────────────┘ + │ + ▼ + ┌───────────────────────────────────────────────┐ + │ Angular Dashboard │ + │ │ + │ - KPI cards (Today vs Yesterday) │ + │ - Distribution widgets │ + │ - Trends │ + │ │ + └───────────────────────────────────────────────┘ + + Data rollup = To take many detailed rows and aggregate them into fewer, 
higher-level rows. + + Key tenant scope: + All data is scoped by BOTH: + - customer_id (tenant/customer) + - cluster_id (environment/cluster: prod/stage/etc.) + Session identity is: + (customer_id, cluster_id, sessionid) + + Why this matters: + - data isolation between customers + - isolation between environments + - allows filtering dashboards by customer+cluster with predictable performance + + ----------------------------------------------------------------------------------------------------- + Operational Notes + ----------------------------------------------------------------------------------------------------- + + • Refreshable MVs (REFRESH EVERY ...) are used to periodically recompute recent results. This gives + near-real-time dashboards without paying heavy costs on every query. + + • "Late arriving events": + We assume some events can arrive late (ingestion delays, client retries, network issues). Therefore: + - session_states is updated incrementally in real-time + - session_facts is re-finalized for sessions active in a rolling time window (e.g., last 72h) + - daily rollups are recomputed for last N days (e.g., 90 days) + + • IMPORTANT: + The heuristics for device and browser bucketing should be validated against your UA parser output. + You can refine the classification later without changing the pipeline structure. + +===================================================================================================== */ + +/* ===================================================================================================== + 1) SESSION STATES (INCREMENTAL, MERGEABLE) + ===================================================================================================== + + Object: Table + Name: clickhouse_test_db.session_states + Engine: AggregatingMergeTree + Updated by: session_states_mv (real-time MV on events inserts) + + Purpose: + - Incrementally aggregates events into per-session "states" that can be merged efficiently. 
Initially, + you get multiple partial rows for the same session key. However, at query time (or during background + merges), ClickHouse merges them, so N partial rows become 1 correct session. + - This enables: + • real-time incremental ingestion + • late arriving event correction (because new states merge in) + • avoiding expensive GROUP BY on raw events repeatedly + • Inserts that stay fast: + • MV does simple aggregation on the batch + • no global coordination + • no locking + - Instead of recomputing sessions from raw events every 15 mins, we continuously maintain aggregate + states per session using an MV. This is extremely scalable in ClickHouse because: + • inserts are append-only + • aggregation states merge efficiently during background merges + • your “session table” becomes tiny compared to raw events + - The table ENGINE set to 'AggregatingMergeTree' means: + • Rows are not final + • Multiple partial rows for the same session will exist. ClickHouse will merge them later + • Every column must be mergeable + • This is why you never query session_states directly for dashboards + - A mental analogy: Think of session_states as a shopping cart: + • items get added in multiple steps + • sometimes items arrive late + • the cart is not “final” until checkout + -> session_facts is the receipt. + + Key Grain (one logical session): + (customer_id, cluster_id, sessionid, context_site_id) + + Column strategy: + We store AggregateFunction(...) columns rather than finalized values. + Example: + - min_ts_state stores minState(_timestamp) + - later, minMerge(min_ts_state) finalizes the min timestamp across merges + + Why using argMaxState for dimensions? + - Session metadata can be emitted multiple times. + - We want a deterministic "latest value" for device/browser/language. + - argMaxState(value, _timestamp) keeps the value associated with the greatest timestamp. 
+ +===================================================================================================== */ +CREATE TABLE clickhouse_test_db.session_states +( + /* Tenant boundaries */ + customer_id String, -- dotCMS customer / tenant identifier + cluster_id String, -- environment/cluster identifier (prod/stage/etc.) + /* We use argMax for context_site_id because session_states is mergeable, and ClickHouse needs an explicit rule + to deterministically resolve the value when partial session data is merged. */ + context_site_id_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')), + context_site_id String, + /* Session boundary */ + sessionid String, -- unique session identifier; all events with same sessionid belong together + + /* Session time window (mergeable aggregate states) */ + min_ts_state AggregateFunction(min, DateTime64(3, 'UTC')), -- earliest event timestamp seen in session + max_ts_state AggregateFunction(max, DateTime64(3, 'UTC')), -- latest event timestamp seen in session + + /* Event counters (mergeable) */ + total_events_state AggregateFunction(count), -- total events in session + pageviews_state AggregateFunction(countIf, UInt8), -- number of pageview events in session + conversions_state AggregateFunction(countIf, UInt8), -- number of conversion events in session + + user_agent_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')), + + /* Dimension "last known value" states (mergeable) */ + device_category_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')), + -- last-seen device category label for the session (Desktop/Mobile/Tablet/Other) + -- derived from UA fields; stored as state so that late events can update the final value deterministically. 
+ + browser_family_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')), + -- last-seen browser family bucket (Chrome/Safari/Firefox/Edge/Other) + + language_id_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')) + -- last-seen dotCMS language id (as String), defaulting to '0' if unknown +) + /* Why this engine is mandatory: + -> You are storing aggregate states + -> You rely on merge correctness + -> Without replication, different replicas would compute different session states */ + /*ENGINE = ReplicatedAggregatingMergeTree( + '/clickhouse/tables/{shard}/session_states', + '{replica}' + )*/ + ENGINE = AggregatingMergeTree + /* Partitioning note: + We partition by a hash of (customer, cluster) to spread writes and merges. + This avoids a single giant partition for big tenants and keeps merges parallelizable. */ + PARTITION BY sipHash64(customer_id, cluster_id) % 64 + /* Note for Sort key: + ORDER BY includes tenant + session to keep session states physically clustered for merges/finalization. + This also ensures stable grouping keys for session_facts refresh queries. */ + ORDER BY ( + customer_id, + cluster_id, + sessionid, + context_site_id); + + +/* ===================================================================================================== + Device categorization + ===================================================================================================== */ +/* + Object: Table + Name: clickhouse_test_db.device_category_map + Engine: MergeTree + Used by: session_states_mv (during ingestion) + + This is a tiny dimension table that maps normalized UA parser outputs into a device category bucket. This + table is meant to classify using normalized keys we already trust: + + - parsed_ua_device_family (dev_key) + - parsed_ua_os_family (os_key) + + Those fields come from the UA parser and are typically already tokenized into a clean family name (e.g., iPhone, + iPad, Windows, Mac OS X, etc.). 
After lowerUTF8(...), those become stable join keys. Think of this table as a + trusted translation dictionary: + + • Your UA parser gives you tokens like “iphone”, “ipad”, “windows”. + • This table translates those tokens into friendly categories like “Mobile”, “Tablet”, “Desktop”. + • It’s clean, deterministic, and fast — like looking up a word in a dictionary. + */ +CREATE TABLE IF NOT EXISTS clickhouse_test_db.device_category_map +( + -- What we match on (pick one: device_family, os_family, ua_family, etc.) + -- For performance and simplicity, we store normalized lowercase keys. + match_key String, -- e.g. 'iphone', 'android', 'ipad', 'smarttv', 'windows' + device_category String -- e.g. 'Mobile', 'Tablet', 'Smart TV', 'Desktop', 'Other' +) + ENGINE = MergeTree + ORDER BY match_key; + +/* Initial preload of the most common devices inside their respective categories */ +INSERT INTO clickhouse_test_db.device_category_map VALUES + ('smarttv', 'Smart TV'), + ('smart tv', 'Smart TV'), + ('tizen', 'Smart TV'), + ('webos', 'Smart TV'), + ('hbbtv', 'Smart TV'), + ('roku', 'Smart TV'), + ('appletv', 'Smart TV'), + ('apple tv', 'Smart TV'), + + ('ipad', 'Tablet'), + ('tablet', 'Tablet'), + ('kindle', 'Tablet'), + + ('iphone', 'Mobile'), + ('ios', 'Mobile'), + ('android', 'Mobile'), + ('pixel', 'Mobile'), + ('samsung', 'Mobile'), + + ('windows', 'Desktop'), + ('mac', 'Desktop'), + ('macos', 'Desktop'), + ('mac os x', 'Desktop'), + ('os x', 'Desktop'), + ('linux', 'Desktop'); + +/* + Object: Table + Name: clickhouse_test_db.device_category_fallback_rules + Engine: MergeTree + Used by: session_facts_rmv (during session finalization, not ingestion) + + Device fallback rules are set for when the base classification is too generic (“Desktop/Other”) and you want + a last-resort rescue by scanning the raw UA string. 
This table is a small set of priority-ordered fallback + heuristics used only when the base classification is too generic, and you still have the raw user-agent string + available. This is valuable when: + + - the UA parser doesn’t recognize the device family cleanly; i.e., it fails to identify Smart TVs/consoles cleanly + - or collapses unknown stuff into generic families + - or you see real-world UAs where the relevant signal is only in the raw string + */ +CREATE TABLE IF NOT EXISTS clickhouse_test_db.device_category_fallback_rules +( + priority UInt16, -- lower = evaluated first + category String, -- e.g. 'Smart TV', 'Tablet', 'Mobile' + patterns Array(String) -- case-insensitive substrings +) + ENGINE = MergeTree + ORDER BY priority; + +/* ⚠️ Keep this table small. Think “last-resort heuristics”, not full UA parsing. */ +INSERT INTO clickhouse_test_db.device_category_fallback_rules VALUES + (10, 'Smart TV', ['smarttv','smart tv','tizen','webos','hbbtv','roku','appletv']), + (20, 'Tablet', ['ipad','tablet','kindle']), + (30, 'Mobile', ['iphone','android','mobile']), + (90, 'Desktop', ['windows','macintosh','linux']), + (15, 'Game Console', ['playstation','xbox','nintendo','switch']); + + +/* ===================================================================================================== + Browser categorization + ===================================================================================================== */ + +/* + Object: Table + Name: clickhouse_test_db.browser_family_map + Engine: MergeTree + Used by: session_states_mv (during ingestion) + + This is a small lookup table that maps a normalized UA parser “browser family” token into a stable, dashboard-friendly + browser bucket. This table is meant to classify using normalized keys we already trust: + + - parsed_ua_ua_family (browser_key) + + Those fields come from the UA parser and are typically already tokenized into a clean family name (e.g., Safari, + Chrome, Firefox, etc.). 
After lowerUTF8(...), those become stable join keys.
+ */
+CREATE TABLE IF NOT EXISTS clickhouse_test_db.browser_family_map
+(
+    match_key      String,   -- normalized lowercase
+    browser_family String    -- 'Chrome','Safari','Firefox','Edge','Other'
+)
+    ENGINE = MergeTree
+    ORDER BY match_key;
+
+/* Initial preload of the most common browsers inside their respective categories */
+INSERT INTO clickhouse_test_db.browser_family_map VALUES
+    ('chrome', 'Chrome'),
+    ('chromium', 'Chrome'),
+    ('headlesschrome', 'Chrome'),
+    ('crios', 'Chrome'),
+    ('safari', 'Safari'),
+    ('mobile safari', 'Safari'),
+    ('firefox', 'Firefox'),
+    ('fxios', 'Firefox'),
+    ('edge', 'Edge'),
+    ('microsoft edge', 'Edge'),
+    ('edg', 'Edge');
+
+/*
+   Object:     Table
+   Name:       clickhouse_test_db.browser_family_fallback_rules
+   Engine:     MergeTree
+   Used by:    session_facts_rmv (during session finalization)
+
+   Browser fallback rules. They are used only in `session_facts_rmv`, not during ingestion. They are
+   applied once, at session finalization time. This table defines priority-ordered heuristic rules to
+   classify a browser by scanning the raw UA string, but only when the base classification is too generic.
+   Sometimes the browser family token from the UA parser is unhelpful:
+
+   - it’s missing
+   - it’s normalized into something too generic
+   - or it maps to Other even though the UA string clearly contains a recognizable signature
+
+   So this table lets you “rescue” classification only when needed, without making ingestion heavy.
+ */
+CREATE TABLE IF NOT EXISTS clickhouse_test_db.browser_family_fallback_rules
+(
+    priority UInt16,
+    category String,
+    patterns Array(String)
+)
+    ENGINE = MergeTree
+    ORDER BY priority;
+
+/* ⚠️ Keep this table small. Think “last-resort heuristics”, not full UA parsing.
*/ +INSERT INTO clickhouse_test_db.browser_family_fallback_rules VALUES + (10, 'Edge', ['edg','edge']), + (20, 'Chrome', ['headlesschrome/','chrome','chromium','crios']), + (30, 'Safari', ['safari']), + (40, 'Firefox', ['firefox']), + (90, 'Other', ['opera','ucbrowser']); + + +/* ===================================================================================================== + 1.1) Real-time MV: events → session_states + ===================================================================================================== + + Object: Materialized View (incremental, real-time) + Name: clickhouse_test_db.session_states_mv + Pattern: CREATE MATERIALIZED VIEW ... TO session_states AS SELECT ... + Source: clickhouse_test_db.events + Target: clickhouse_test_db.session_states + + Purpose: + - Runs on every insert to events table. + - It takes only the NEWLY INSERTED BATCH and turns it into mergeable aggregate-state rows that get + appended into session_states. + - Writes those states into session_states where they will be merged with prior states. + - This MV is the engine that makes the whole pipeline scalable. Without it, you’d be forced to compute + sessions by scanning events with a big GROUP BY whenever you need session metrics (slow and expensive + at scale). + + Key guarantee: + - Correctness over time: if more events arrive for a session later, the session_states row merges in. + + Device bucketing: + This is a heuristic based on the UA parsing fields you have: + - Tablet if device family suggests ipad/tablet + - Mobile if OS suggests iOS/Android or device hints phone/android + - Desktop otherwise + You can refine this as you validate UA outputs. 
+ + Browser bucketing: + - normalizes UA family into a stable set of display-friendly buckets + - unrecognized → Other + + Language id: + - uses `userlanguage` + - replace this expression if language id comes from a different field in your tracking payload + + - Automatically replicated + - No engine change required + - They write into replicated tables → safe + +===================================================================================================== */ +CREATE MATERIALIZED VIEW clickhouse_test_db.session_states_mv + TO clickhouse_test_db.session_states +AS +/* ------------------------------------------------------------------------------------------------- + Tables-only enrichment: + - device_category_map and browser_family_map are tiny dimension tables in ClickHouse + - We LEFT JOIN to them using normalized lowercase keys + - We keep the same “last known value” strategy with argMaxState(..., _timestamp) +------------------------------------------------------------------------------------------------- */ +WITH + lowerUTF8(parsed_ua_device_family) AS device_key, + lowerUTF8(parsed_ua_os_family) AS os_key, + lowerUTF8(parsed_ua_ua_family) AS browser_key, + + /* Prefer device_family mapping, else os_family mapping, else Desktop */ + coalesce(nullIf(d_dev.device_category, ''), nullIf(d_os.device_category, ''), 'Desktop') AS device_category, + + /* Prefer browser mapping, else Other */ + coalesce(nullIf(b_map.browser_family, ''), 'Other') AS browser_family, + + nullIf(userlanguage, '') AS language_id +SELECT + e.customer_id, + e.cluster_id, + e.context_site_id, + e.sessionid, + + /* Session time window states */ + minState(e._timestamp) AS min_ts_state, + maxState(e._timestamp) AS max_ts_state, + + /* Session-scoped ownership */ + argMaxState(e.context_site_id, e._timestamp) AS context_site_id_state, + + /* Event counters states */ + countState() AS total_events_state, + countIfState(e.event_type = 'pageview') AS pageviews_state, + 
countIfState(e.event_type = 'conversion') AS conversions_state,
+
+    /* Session UA stored as state (needed for fallback later) */
+    argMaxState(e.user_agent, e._timestamp) AS user_agent_state,
+
+    /* "last seen" dimension states (tables-only values) */
+    argMaxState(device_category, e._timestamp) AS device_category_state,
+    argMaxState(browser_family, e._timestamp) AS browser_family_state,
+    argMaxState(coalesce(language_id, '0'), e._timestamp) AS language_id_state
+FROM clickhouse_test_db.events AS e
+    /* Device mapping via table */
+    LEFT JOIN clickhouse_test_db.device_category_map AS d_dev
+        ON d_dev.match_key = device_key
+    LEFT JOIN clickhouse_test_db.device_category_map AS d_os
+        ON d_os.match_key = os_key
+    /* Browser mapping via table */
+    LEFT JOIN clickhouse_test_db.browser_family_map AS b_map
+        ON b_map.match_key = browser_key
+WHERE e.sessionid != ''
+  AND e.customer_id != ''
+  AND e.cluster_id != ''
+  AND e.context_site_id != ''
+GROUP BY (
+    e.customer_id,
+    e.cluster_id,
+    e.context_site_id,
+    e.sessionid);
+
+
+/* =====================================================================================================
+   2) SESSION FACTS (FINALIZED SNAPSHOT)
+   =====================================================================================================
+
+   Object:       Table
+   Name:         clickhouse_test_db.session_facts
+   Engine:       ReplacingMergeTree(updated_at)
+   Written by:   session_facts_rmv (refreshable MV that finalizes session_states)
+   Role in pipeline: “finalized session snapshot” — one row per session, query-friendly
+
+   Purpose:
+   - This is the first table in the pipeline that is meant to be read directly by downstream rollups and
+     dashboards. Stores one finalized row per session with plain scalar values (per session aggregates).
+   - This table represents “what happened in a session”, including duration, event counts, and the engaged
+     flag. This gives you a stable, query-friendly “session dimension” with everything you need for engagement
+     metrics.
+ - This is the "source of truth" for session-level analytics and is the input for daily rollups. + + Why using ReplacingMergeTree? + - Because we periodically "re-finalize" the same sessions as late events arrive. + - Each refresh writes a newer version (updated_at). + - ReplacingMergeTree keeps only the latest version during merges. + + IMPORTANT: + - When verifying correctness, do NOT rely on FINAL in production queries. Use FINAL only for + debugging/sanity checks. + - For normal queries, the latest version will be used by merges/reads over time. + +===================================================================================================== */ +CREATE TABLE clickhouse_test_db.session_facts +( + /* Tenant scope */ + customer_id String, + cluster_id String, + context_site_id String, + /* Session identity */ + sessionid String, + + /* Finalized session times */ + session_start DateTime64(3, 'UTC'), -- earliest event timestamp + session_end DateTime64(3, 'UTC'), -- latest event timestamp + duration_seconds UInt32, -- session_end - session_start (seconds) + + /* Finalized counters */ + total_events UInt32, -- total events in session + pageviews UInt32, -- pageview events + conversions UInt32, -- conversion events + + /* Engagement flag (GA4-style) */ + engaged UInt8, -- 1 if engaged, else 0 + + /* Finalized dimensions */ + device_category String, -- Desktop/Mobile/Tablet/Other + browser_family String, -- Chrome/Safari/Firefox/Edge/Other + language_id String, -- dotCMS language id as String ('0' unknown) + + /* Row version timestamp for ReplacingMergeTree */ + updated_at DateTime('UTC') +) + /* Why this matters: + -> Late events cause re-finalization + -> Multiple versions of the same session will exist temporarily + -> Replication guarantees: + -> all replicas converge to the same “latest” row + -> dashboards don’t disagree depending on which replica is queried */ + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/session_facts', + 
'{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + /* Partition by month of session_start. Keeps partitions time-bounded and supports TTL strategies + later if desired. */ + PARTITION BY toYYYYMM(toDate(session_start)) + /* Sort key includes session identity for deterministic replacement. */ + ORDER BY ( + customer_id, + cluster_id, + context_site_id, + sessionid); + + +/* ===================================================================================================== + 2.1) Refreshable MV: session_states → session_facts + ===================================================================================================== + + Object: Refreshable Materialized View (RMV) + Name: clickhouse_test_db.session_facts_rmv + Type: REFRESH EVERY 15 MINUTE … TO session_facts + Source: clickhouse_test_db.session_states + Target: clickhouse_test_db.session_facts + + Purpose: + - This RMV is the finalizer: it takes “mergeable session state fragments” from session_states and + periodically produces (or re-produces) the latest finalized snapshot of each session in session_facts. + It’s not triggered by inserts; it runs on a schedule. + - Applies the engagement rules to compute engaged = 1 (engaged) or 0 (not engaged). + - Recompute “recent sessions” every 15 minutes. + - Important design choice: do not recompute “all history” every 15 minutes. Instead, recompute a + sliding window that covers late-arriving events + sessions that are still “open”. A very typical + window is 48 hours (tune as needed). We're setting 72 hours back for now. + + Rolling window (start_cutoff): + - We only finalize sessions whose session_end is in the last 72 hours, which can be increased as long as + the REFRESH EVERY value is increased as well. + - This bounds cost and still corrects for late arriving events. + - For older late events, a nightly "re-finalize last 30 days" job can be added in the future. 
+ + Engagement rules (session is engaged if any): + - duration > 10 seconds + - pageviews >= 2 + - conversions >= 1 + +===================================================================================================== */ +CREATE MATERIALIZED VIEW clickhouse_test_db.session_facts_rmv + REFRESH EVERY 15 MINUTE + TO clickhouse_test_db.session_facts +AS +WITH + (now() - INTERVAL 72 HOUR) AS start_cutoff, + + /* Load fallback rules for device and browser once (small arrays, evaluated per session row) */ + ( + SELECT arraySort(x -> x.1, groupArray((priority, category, patterns))) + FROM clickhouse_test_db.device_category_fallback_rules + ) AS device_rules_sorted, + + ( + SELECT arraySort(x -> x.1, groupArray((priority, category, patterns))) + FROM clickhouse_test_db.browser_family_fallback_rules + ) AS browser_rules_sorted +SELECT + customer_id, + cluster_id, + context_site_id, + sessionid, + + session_start, + session_end, + duration_seconds, + + total_events, + pageviews, + conversions, + + engaged, + + /* FINAL device_category (map-first, UA-pattern fallback second) + * + * - device_category_base is derived from session_states_mv using table joins: + * device_category_map (device_family/os_family) → category, else 'Desktop' + * - We only apply fallback when base is generic ('Desktop' or 'Other') AND we have a UA string. + * - Fallback scans ordered rules (priority asc) and returns the first match. + * - If no fallback matches, keep base. 
+ */
+    if ((device_category_base IN ('Desktop', 'Other')) AND ua_l != '',
+        coalesce(
+            nullIf(
+                ifNull(
+                    arrayFirst(r -> multiSearchAnyCaseInsensitive(ua_l, r.3) > 0, device_rules_sorted).2,
+                    ''
+                ),
+                ''
+            ), device_category_base),
+        -- else
+        device_category_base
+    ) AS device_category,
+
+    /* FINAL browser_family (map-first, UA-pattern fallback second)
+     *
+     * - browser_family_base is derived from session_states_mv using table joins:
+     *   browser_family_map (ua_family) → family, else 'Other'
+     * - We only apply fallback when base resolves to 'Other' AND UA string is available.
+     * - Fallback is priority ordered and returns the first match.
+     * - If no fallback matches, keep base.
+     */
+    if ((browser_family_base = 'Other') AND ua_l != '',
+        coalesce(
+            nullIf(
+                ifNull(
+                    arrayFirst(r -> multiSearchAnyCaseInsensitive(ua_l, r.3) > 0, browser_rules_sorted).2,
+                    ''
+                ),
+                ''
+            ), browser_family_base),
+        -- else
+        browser_family_base
+    ) AS browser_family,
+
+    language_id,
+
+    now() AS updated_at
+FROM
+    (
+        /* Aggregate session_states into finalized scalar columns */
+        SELECT
+            customer_id,
+            cluster_id,
+            context_site_id,
+            sessionid,
+
+            minMerge(min_ts_state) AS session_start,
+            maxMerge(max_ts_state) AS session_end,
+
+            toUInt32(greatest(0, dateDiff('second', session_start, session_end))) AS duration_seconds,
+
+            toUInt32(countMerge(total_events_state))   AS total_events,
+            toUInt32(countIfMerge(pageviews_state))    AS pageviews,
+            toUInt32(countIfMerge(conversions_state))  AS conversions,
+
+            /* Business rules that determine whether a session is flagged as 'engaged' or not */
+            toUInt8(
+                -- Sessions that last more than 10 seconds
+                (dateDiff('second', session_start, session_end) > 10)
+                -- Sessions that trigger at least 2 events of type 'pageview'
+                OR (countIfMerge(pageviews_state) >= 2)
+                -- Sessions that trigger at least 1 event of type 'conversion'
+                OR (countIfMerge(conversions_state) >= 1)
+            ) AS engaged,
+
+            /* Base values now come from table-join mapping
in session_states_mv */ + coalesce(nullIf(argMaxMerge(device_category_state), ''), 'Desktop') AS device_category_base, + coalesce(nullIf(argMaxMerge(browser_family_state), ''), 'Other') AS browser_family_base, + + /* UA for fallback matching */ + lowerUTF8(argMaxMerge(user_agent_state)) AS ua_l, + + argMaxMerge(language_id_state) AS language_id + FROM clickhouse_test_db.session_states + GROUP BY ( + customer_id, + cluster_id, + context_site_id, + sessionid) + HAVING session_end >= start_cutoff + ) s; + + +/* ===================================================================================================== + 3) DAILY ENGAGEMENT ROLLUP (DASHBOARD READY) + ===================================================================================================== + + Object: Table + Name: clickhouse_test_db.engagement_daily + Engine: ReplacingMergeTree(updated_at) + Written by: engagement_daily_rmv (refreshable MV) + Read by: CubeJS + dashboard KPI cards / trend charts + + Purpose: + - Stores daily aggregates derived from session_facts. It's intentionally small and dashboard-friendly. + - This table is meant to serve the dashboard fast (engagement rate, trend, average interactions, + average session time, conversion rate) without scanning sessions. + - This is the primary table queried by KPI cards and trend charts. + - It avoids scanning session_facts for common dashboards. + + Why store daily sums? + - For any time range, correct results are computed as: + sum(numerator) / sum(denominator) + NOT average of daily rates. + - Having daily sums allows accurate ratios for arbitrary date ranges. Dashboards frequently query arbitrary + date ranges (e.g., “last 28 days”, “Jan 10–Feb 2”). If you store daily rates (like daily engagement%), + then combining days correctly becomes tricky (you must weight by denominators). Instead, you store daily + numerators and denominators, so any date-range query can compute accurate rates. 
+ + Grain: + (customer_id, cluster_id, context_site_id, day) + + Metrics included: + - total_sessions + - engaged_sessions + - engaged_conversion_sessions + - total_events_all + - total_duration_all + - total_events_engaged + - total_duration_engaged + +===================================================================================================== */ +CREATE TABLE clickhouse_test_db.engagement_daily +( + customer_id String, + cluster_id String, + context_site_id String, + day Date, + + total_sessions UInt64, -- count of all sessions + engaged_sessions UInt64, -- count of engaged sessions + engaged_conversion_sessions UInt64, -- engaged sessions that include >=1 conversion + + total_events_all UInt64, -- sum(total_events) across all sessions + total_duration_all UInt64, -- sum(duration_seconds) across all sessions + + total_events_engaged UInt64, -- sum(total_events) across engaged sessions only + total_duration_engaged UInt64, -- sum(duration_seconds) across engaged sessions only + + updated_at DateTime('UTC') +) + /* Why replicate rollups? 
+ -> Refreshable MVs rewrite rows + -> Each replica must agree on the final row version + -> Otherwise, engagement rates can differ between replicas */ + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/engagement_daily', + '{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, cluster_id, context_site_id, day); + + +/* ===================================================================================================== + 3.1) Refreshable MV: session_facts → engagement_daily + ===================================================================================================== + + Object: Refreshable Materialized View (RMV) + Name: clickhouse_test_db.engagement_daily_rmv + Type: REFRESH EVERY 15 MINUTE … TO engagement_daily + Source: clickhouse_test_db.session_facts + Target: clickhouse_test_db.engagement_daily + + Purpose: + - Periodically recompute daily rollups for recent days from the finalized session snapshot table + (session_facts) and writes the results into the dashboard-ready rollup table (engagement_daily). + - Here we typically roll up a wider window (e.g., last 90 days) because session_facts is already + small and cheap to scan, we can afford to recompute aggregates for many days back — like 90 days — + every time the MV refreshes (much smaller than raw events). + - Cheap because session_facts is far smaller than events. + + Start window (start_day): + - Recompute last 90 days by default. + - Tune based on retention needs + cost tolerance. + - This makes daily rollups correct even if session_facts rows are updated by late events. 
+ +===================================================================================================== */ +CREATE MATERIALIZED VIEW clickhouse_test_db.engagement_daily_rmv + REFRESH EVERY 15 MINUTE + TO clickhouse_test_db.engagement_daily +AS +WITH (today() - 90) AS start_day +SELECT + customer_id, + cluster_id, + context_site_id, + toDate(session_start) AS day, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + countIf(engaged = 1 AND conversions >= 1) AS engaged_conversion_sessions, + + sum(total_events) AS total_events_all, + sum(duration_seconds) AS total_duration_all, + + sumIf(total_events, engaged = 1) AS total_events_engaged, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged, + + now() AS updated_at +FROM clickhouse_test_db.session_facts +WHERE toDate(session_start) >= start_day +GROUP BY ( + customer_id, + cluster_id, + context_site_id, + day); + + +/* ===================================================================================================== + 4) DIMENSION ROLLUPS (DEVICE / BROWSER / LANGUAGE) + ===================================================================================================== + + These rollups power the "distribution" widgets: + - Sessions by device + - Sessions by browser + - Sessions by language + + For each bucket (e.g., Desktop): + - show engaged_sessions (absolute count) + - show engaged% within bucket = engaged_sessions / total_sessions + - show avg engaged time = total_duration_engaged / engaged_sessions + + Therefore, each rollup stores BOTH: + - total_sessions (all sessions in bucket) + - engaged_sessions (subset) + - total_duration_engaged_seconds (duration sum for engaged subset) + + Grain: + (customer_id, cluster_id, context_site_id, day, {bucket_value}) + {bucket_value} = device, browser, or language ID + +===================================================================================================== */ + +/* 
----------------------------------------------------------------------------------------------------- + 4.1) Sessions by Device (daily) +----------------------------------------------------------------------------------------------------- */ + +/* + Object: Table + Name: clickhouse_test_db.sessions_by_device_daily + Engine: ReplacingMergeTree(updated_at) + Written by: sessions_by_device_daily_rmv + Role: daily rollup for the “Sessions by Device” distribution widget + + Mental model: + Think of this table as a daily per-site scoreboard by device category. + Each row stores raw counts (total sessions, engaged sessions, engaged time) + for one device bucket on one day. The table is periodically rewritten from + session_facts to correct late-arriving data. Dashboards compute percentages + and averages from these raw totals instead of storing precomputed rates. +*/ +CREATE TABLE clickhouse_test_db.sessions_by_device_daily +( + customer_id String, + cluster_id String, + context_site_id String, + day Date, + + device_category String, -- Desktop/Mobile/Tablet/Other + + total_sessions UInt64, -- ALL sessions for this device_category + engaged_sessions UInt64, -- Engaged sessions for this device_category + total_duration_engaged_seconds UInt64, -- Sum(duration_seconds) for engaged sessions only + + updated_at DateTime('UTC') +) + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/sessions_by_device_daily', + '{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, cluster_id, context_site_id, day, device_category); + +/* + Object: RMV + Name: clickhouse_test_db.sessions_by_device_daily_rmv + Type: REFRESH EVERY 15 MINUTE … TO sessions_by_device_daily + Source: session_facts + Target: sessions_by_device_daily + + Recomputes a bounded window (this script uses last 90 days via start_day) + */ +CREATE MATERIALIZED VIEW clickhouse_test_db.sessions_by_device_daily_rmv + REFRESH EVERY 15 MINUTE + TO 
clickhouse_test_db.sessions_by_device_daily +AS +WITH (today() - 90) AS start_day +SELECT + customer_id, + cluster_id, + context_site_id, + toDate(session_start) AS day, + device_category, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, + + now() AS updated_at +FROM clickhouse_test_db.session_facts +WHERE toDate(session_start) >= start_day +GROUP BY ( + customer_id, + cluster_id, + context_site_id, + day, + device_category); + + +/* ----------------------------------------------------------------------------------------------------- + 4.2) Sessions by Browser (daily) +----------------------------------------------------------------------------------------------------- */ + +/* + Object: Table + Name: clickhouse_test_db.sessions_by_browser_daily + Engine: ReplacingMergeTree(updated_at) + Written by: sessions_by_browser_daily_rmv + Role: daily rollup for the “Sessions by Browser” distribution widget + + Mental model: + Think of this table as a daily per-site scoreboard by browser. + Each row stores raw counts (total sessions, engaged sessions, engaged time) + for one browser bucket on one day. The table is periodically rewritten from + session_facts to correct late-arriving data. Dashboards compute percentages + and averages from these raw totals instead of storing precomputed rates. 
+*/ +CREATE TABLE clickhouse_test_db.sessions_by_browser_daily +( + customer_id String, + cluster_id String, + context_site_id String, + day Date, + + browser_family String, -- Chrome/Safari/Firefox/Edge/Other + + total_sessions UInt64, + engaged_sessions UInt64, + total_duration_engaged_seconds UInt64, + + updated_at DateTime('UTC') +) + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/sessions_by_browser_daily', + '{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, cluster_id, context_site_id, day, browser_family); + +/* + Object: RMV + Name: clickhouse_test_db.sessions_by_browser_daily_rmv + Type: REFRESH EVERY 15 MINUTE … TO sessions_by_browser_daily + + Same pattern as device: bounded window (e.g., last 90 days) + */ +CREATE MATERIALIZED VIEW clickhouse_test_db.sessions_by_browser_daily_rmv + REFRESH EVERY 15 MINUTE + TO clickhouse_test_db.sessions_by_browser_daily +AS +WITH (today() - 90) AS start_day +SELECT + customer_id, + cluster_id, + context_site_id, + toDate(session_start) AS day, + browser_family, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, + + now() AS updated_at +FROM clickhouse_test_db.session_facts +WHERE toDate(session_start) >= start_day +GROUP BY ( + customer_id, + cluster_id, + context_site_id, + day, + browser_family); + + +/* ----------------------------------------------------------------------------------------------------- + 4.3) Sessions by Language (daily) +----------------------------------------------------------------------------------------------------- */ + +/* + Object: Table + Name: clickhouse_test_db.sessions_by_language_daily + Engine: ReplacingMergeTree(updated_at) + Written by: sessions_by_language_daily_rmv + Role: daily rollup for “Sessions by Language” distribution widget + + Mental model: + Think of this table as a daily per-site 
scoreboard by language. + Each row stores raw counts (total sessions, engaged sessions, engaged time) + for one language bucket on one day. The table is periodically rewritten from + session_facts to correct late-arriving data. Dashboards compute percentages + and averages from these raw totals instead of storing precomputed rates. +*/ +CREATE TABLE clickhouse_test_db.sessions_by_language_daily +( + customer_id String, + cluster_id String, + context_site_id String, + day Date, + + language_id String, -- dotCMS language id as String ('0' unknown) + + total_sessions UInt64, + engaged_sessions UInt64, + total_duration_engaged_seconds UInt64, + + updated_at DateTime('UTC') +) + /*ENGINE = ReplicatedReplacingMergeTree( + '/clickhouse/tables/{shard}/sessions_by_language_daily', + '{replica}', + updated_at + )*/ + ENGINE = ReplacingMergeTree + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, cluster_id, context_site_id, day, language_id); + +/* + Object: RMV + Name: clickhouse_test_db.sessions_by_language_daily_rmv + Type: REFRESH EVERY 15 MINUTE … TO sessions_by_language_daily + + Same recompute model: scan session_facts for last N days (90) + */ +CREATE MATERIALIZED VIEW clickhouse_test_db.sessions_by_language_daily_rmv + REFRESH EVERY 15 MINUTE + TO clickhouse_test_db.sessions_by_language_daily +AS +WITH (today() - 90) AS start_day +SELECT + customer_id, + cluster_id, + context_site_id, + toDate(session_start) AS day, + language_id, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, + + now() AS updated_at +FROM clickhouse_test_db.session_facts +WHERE toDate(session_start) >= start_day +GROUP BY ( + customer_id, + cluster_id, + context_site_id, + day, + language_id);